From 7538ddf8d3213340fae189853966c7302cc1540a Mon Sep 17 00:00:00 2001 From: zhushuang <974198603@qq.com> Date: Tue, 13 Jan 2026 10:07:07 +0800 Subject: [PATCH 1/2] issue/899 - fix: fix causal_softmax and rearrange bug --- .../moore/causal_softmax_kernel.h | 2 +- .../ops/rearrange/moore/rearrange_kernel.h | 37 +-- .../ops/rearrange/moore/rearrange_moore.mu | 238 +++++++----------- 3 files changed, 102 insertions(+), 175 deletions(-) diff --git a/src/infiniop/ops/causal_softmax/moore/causal_softmax_kernel.h b/src/infiniop/ops/causal_softmax/moore/causal_softmax_kernel.h index 19d364552..111e7cd4c 100644 --- a/src/infiniop/ops/causal_softmax/moore/causal_softmax_kernel.h +++ b/src/infiniop/ops/causal_softmax/moore/causal_softmax_kernel.h @@ -28,7 +28,7 @@ __device__ void causalSoftmaxKernel( // 1 | * * * ... * * | // 2 | * * * ... * * * | // height: 3 col_id-> - if (width + blockIdx.x >= threadIdx.x + height) { + if (width + blockIdx.x >= col + height) { if constexpr (std::is_same_v || std::is_same_v) { /* * MUSA does not support CUDA's native `hexp` function. 
diff --git a/src/infiniop/ops/rearrange/moore/rearrange_kernel.h b/src/infiniop/ops/rearrange/moore/rearrange_kernel.h index b62bd507c..b11084015 100644 --- a/src/infiniop/ops/rearrange/moore/rearrange_kernel.h +++ b/src/infiniop/ops/rearrange/moore/rearrange_kernel.h @@ -31,7 +31,7 @@ struct Constraint { // 定义宏生成内核函数 #define DEFINE_REARRANGE_KERNEL(Tmem_type, constraint_num, block_array_size, grid_array_size) \ - extern "C" __global__ void rearrange_unit_##Tmem_type##_block_##block_array_size##_grid_##grid_array_size##_constrain_##constraint_num( \ + extern "C" INFINIOP_MOORE_KERNEL rearrange_unit_##Tmem_type##_block_##block_array_size##_grid_##grid_array_size##_constrain_##constraint_num( \ void *__restrict__ dst, \ const void *__restrict__ src, \ const size_t block_dim, \ @@ -64,13 +64,13 @@ struct Constraint { size_t remaining \ = blockIdx.x; \ \ - for (ssize_t i = grid_array_size - 1; i >= 0; i--) { \ + for (ptrdiff_t i = grid_array_size - 1; i >= 0; i--) { \ size_t idx = remaining % grid_len.a[i]; \ remaining /= grid_len.a[i]; \ src_offset += idx * src_grid_stride.a[i]; \ dst_offset += idx * dst_grid_stride.a[i]; \ if (constraint_num > 0) { \ - for (ssize_t j = 0; j < constraint_num; j++) { \ + for (ptrdiff_t j = 0; j < constraint_num; j++) { \ if (i == constraints.a[j].grid_idx) { \ constraints_grid_idx_multiple[j] = idx * constraints.a[j].grid_div_block; \ } \ @@ -81,7 +81,7 @@ struct Constraint { /* 将结果存入共享内存 */ \ shared_src_offset = src_offset; \ shared_dst_offset = dst_offset; \ - for (ssize_t j = 0; j < constraint_num; j++) { \ + for (ptrdiff_t j = 0; j < constraint_num; j++) { \ shared_constraints_grid_idx_multiple[j] = constraints_grid_idx_multiple[j]; \ } \ } \ @@ -93,18 +93,18 @@ struct Constraint { ptrdiff_t src_offset = shared_src_offset; \ ptrdiff_t dst_offset = shared_dst_offset; \ ARRAY_TYPE_SIZE constraints_grid_idx_multiple[constraint_num > 0 ? 
constraint_num : 1]; \ - for (ssize_t j = 0; j < constraint_num; j++) { \ + for (ptrdiff_t j = 0; j < constraint_num; j++) { \ constraints_grid_idx_multiple[j] = shared_constraints_grid_idx_multiple[j]; \ } \ \ - for (ssize_t i = block_array_size - 1; i >= 0; i--) { \ + for (ptrdiff_t i = block_array_size - 1; i >= 0; i--) { \ size_t idx = remaining % block_len.a[i]; \ remaining /= block_len.a[i]; \ /* 计算偏移量 */ \ src_offset += idx * src_block_stride.a[i]; \ dst_offset += idx * dst_block_stride.a[i]; \ if (constraint_num > 0) { \ - for (ssize_t j = 0; j < constraint_num; j++) { \ + for (ptrdiff_t j = 0; j < constraint_num; j++) { \ if (i == constraints.a[j].block_idx) { \ if (constraints_grid_idx_multiple[j] + idx >= constraints.a[j].total_len) { \ return; \ @@ -116,7 +116,7 @@ struct Constraint { \ src_offset += remaining * src_block_stride.a[0]; \ dst_offset += remaining * dst_block_stride.a[0]; \ - for (ssize_t j = 0; j < constraint_num; j++) { \ + for (ptrdiff_t j = 0; j < constraint_num; j++) { \ if (0 == constraints.a[j].block_idx) { \ if (constraints_grid_idx_multiple[j] + remaining >= constraints.a[j].total_len) { \ return; \ @@ -134,7 +134,7 @@ struct Constraint { ptrdiff_t dst_offset = 0; \ size_t remaining = blockIdx.x; \ \ - for (ssize_t i = grid_array_size - 1; i >= 0; i--) { \ + for (ptrdiff_t i = grid_array_size - 1; i >= 0; i--) { \ size_t idx = remaining % grid_len.a[i]; \ remaining /= grid_len.a[i]; \ src_offset += idx * src_grid_stride.a[i]; \ @@ -153,7 +153,7 @@ struct Constraint { ptrdiff_t src_offset = shared_src_offset; \ ptrdiff_t dst_offset = shared_dst_offset; \ \ - for (ssize_t i = block_array_size - 1; i > 0; i--) { \ + for (ptrdiff_t i = block_array_size - 1; i > 0; i--) { \ size_t idx = remaining % block_len.a[i]; \ remaining /= block_len.a[i]; \ /* 计算偏移量 */ \ @@ -234,25 +234,8 @@ utils::Result getRearrangeKernel(const RearrangeParams ¶ms) { CHECK_OR_RETURN(grid_num <= MAX_GRID_ARRAY_SIZE && grid_num != 0, INFINI_STATUS_BAD_PARAM); 
CHECK_OR_RETURN(block_num <= MAX_BLOCK_ARRAY_SIZE && block_num != 0, INFINI_STATUS_BAD_PARAM); - CHECK_OR_RETURN(constraint_num <= 2, INFINI_STATUS_BAD_PARAM); - /* - * These variables were originally part of the CUDA implementation for this kernel. - * They have been commented out because they are not currently used in the MUSA kernel logic. - * - * This change resolves "unused variable" warnings during compilation, ensuring a clean build. - * The original declarations are preserved here for for MUSA/CUDA platform alignment. - */ - - // auto block_len = params.block_len.data(); - // auto src_block_stride = params.src_block_stride.data(); - // auto dst_block_stride = params.dst_block_stride.data(); - // auto grid_len = params.grid_len.data(); - // auto src_grid_stride = params.src_grid_stride.data(); - // auto dst_grid_stride = params.dst_grid_stride.data(); - // auto constrain = params.constraints.data(); - void *kernel_func = nullptr; #define GET_REARRANGE_KERNEL(Tmem_type, block_array_size, grid_array_size, constraint_num) \ kernel_func = (void *)rearrange_unit_##Tmem_type##_block_##block_array_size##_grid_##grid_array_size##_constrain_##constraint_num; diff --git a/src/infiniop/ops/rearrange/moore/rearrange_moore.mu b/src/infiniop/ops/rearrange/moore/rearrange_moore.mu index b4c2de390..0ccdc2568 100644 --- a/src/infiniop/ops/rearrange/moore/rearrange_moore.mu +++ b/src/infiniop/ops/rearrange/moore/rearrange_moore.mu @@ -72,7 +72,7 @@ struct SplitDim { /** * 根据给定的元数据准备张量重排参数,该函数主要完成以下工作: * 1. 根据原始元数据调整单元大小,获取更适合GPU处理的单元大小 - * 2. 将维度分配为块(block)维度和网格(grid)维度: + * 2. 
将维度分配为 MUSA 块(block)维度和网格(grid)维度: * 该步骤是核心,目标是为每个block分配尽可能多的相对连续的数据进行处理, * 对无法完整放入块的维度进行分割,并记录分割维度信息,用于防止kernel访问越界,最大化内存访问局部性和计算效率 */ @@ -81,9 +81,7 @@ utils::Result prepareRearrangeParams(const utils::RearrangeMeta // 获取更适合GPU处理的单元大小,这里使用2的幂次方 auto meta_result = original_meta.distributeUnit({32, 16, 8, 4, 2, 1}); - CHECK_RESULT(meta_result); - const utils::RearrangeMeta &meta = meta_result.take(); // 获取维度信息 @@ -123,153 +121,102 @@ utils::Result prepareRearrangeParams(const utils::RearrangeMeta prev_idx_stride = idx_strides[i]; } - // 计算src_strides的降序排序索引,类似于Rust版本中的src_strides_desc_idx - std::vector src_strides_desc_idx(ndim); + std::vector block_dim_choose(ndim, false); + std::vector split_dims; + + // // 初始化计数器 + // size_t block_elements = 1; + + std::vector dim_order(ndim); for (size_t i = 0; i < ndim; ++i) { - src_strides_desc_idx[i] = i; + dim_order[i] = i; } - std::sort(src_strides_desc_idx.begin(), src_strides_desc_idx.end(), + + // 按src_stride升序排序,贪心选择 + std::sort(dim_order.begin(), dim_order.end(), [&dims](size_t a, size_t b) { - return std::abs(dims[a].src_stride) > std::abs(dims[b].src_stride); + return std::abs(dims[a].src_stride) < std::abs(dims[b].src_stride); }); - // 根据最大线程数选择block和grid维度 - const size_t block_size = max_threads; - std::vector block_dim_choose(ndim, false); + constexpr size_t MAX_BLOCK_DIM = MAX_BLOCK_ARRAY_SIZE; - // 初始化计数器 size_t block_elements = 1; - size_t block_src_elements = 1; - size_t block_dst_elements = 1; - size_t src_choose_idx = ndim; - size_t dst_choose_idx = ndim; - - // 用于存储分割维度信息 - std::vector split_dims; + size_t chosen_block_dims = 0; - // 维度选择循环 - while (src_choose_idx > 0 && dst_choose_idx > 0) { - // 获取当前需要处理的维度索引 - size_t src_idx = src_strides_desc_idx[src_choose_idx - 1]; - size_t dst_idx = dst_choose_idx - 1; - - if (src_idx == dst_idx) { - // 源和目标维度相同,可以一起处理 - size_t idx = src_idx; - size_t len = shape[idx]; - - // 检查是否可以将此维度完全添加到block中 - if (block_elements * len <= block_size) { - // 选择此维度 - 
block_dim_choose[idx] = true; - block_elements *= len; - block_src_elements *= len; - block_dst_elements *= len; - src_choose_idx--; - dst_choose_idx--; - } else { - // 需要分割此维度 - size_t num_per_block = block_size / block_elements; - - // 确保num_per_block > 0且len >= num_per_block - if (num_per_block > 0 && len >= num_per_block && num_per_block > 1) { - size_t num_per_grid = (len + num_per_block - 1) / num_per_block; // 向上取整 - - SplitDim split_dim = { - idx, // choose_idx - num_per_block, // num_per_block - num_per_grid, // num_per_grid - 0, // array_struct_idx_block (待更新) - 0, // array_struct_idx_grid (待更新) - len // 原始维度长度 - }; - split_dims.push_back(split_dim); - } - break; + for (size_t i = 0; i < ndim; ++i) { + size_t dim_idx = dim_order[i]; + size_t dim_len = shape[dim_idx]; + + // 1️⃣ 尝试完整放入 block + if (chosen_block_dims < MAX_BLOCK_DIM && + block_elements * dim_len <= (size_t)max_threads) { + + block_dim_choose[dim_idx] = true; + block_elements *= dim_len; + chosen_block_dims++; + continue; + } + + // 2️⃣ 尝试 split(⚠️ split 也会消耗 1 个 block dim) + if (block_elements > 1 && dim_len > 1) { + + // ⚠️ 关键:先检查 block_dim 上限 + if (chosen_block_dims + 1 > MAX_BLOCK_DIM) { + break; // 不能再加 block 维度了 } - } else { - // 源和目标维度不同,需要分别处理 - // 计算块比例 - double src_div_dst = static_cast(block_src_elements) / block_dst_elements; - double src_num_per_block = std::sqrt(block_size / (double)block_elements / src_div_dst); - double dst_num_per_block = src_num_per_block * src_div_dst; - - size_t src_current_dim_len = shape[src_idx]; - size_t dst_current_dim_len = shape[dst_idx]; - - if (static_cast(src_current_dim_len) < src_num_per_block) { - // 源维度可以完全添加到block - block_dim_choose[src_idx] = true; - block_elements *= src_current_dim_len; - block_src_elements *= src_current_dim_len; - src_choose_idx--; - } else if (static_cast(dst_current_dim_len) < dst_num_per_block) { - // 目标维度可以完全添加到block - block_dim_choose[dst_idx] = true; - block_elements *= dst_current_dim_len; - block_dst_elements *= 
dst_current_dim_len; - dst_choose_idx--; - } else { - // 需要分割源和目标维度 - size_t src_num_per_block_int = static_cast(std::floor(src_num_per_block)); - size_t dst_num_per_block_int = static_cast(std::floor(dst_num_per_block)); - - // 计算网格尺寸 - size_t src_num_per_grid = (src_current_dim_len + src_num_per_block_int - 1) / src_num_per_block_int; // 向上取整 - size_t dst_num_per_grid = (dst_current_dim_len + dst_num_per_block_int - 1) / dst_num_per_block_int; // 向上取整 - - // 处理源维度 - if (src_num_per_block_int > 1) { - if (src_num_per_grid == 1) { - // 可以完全放入块 - block_dim_choose[src_idx] = true; - block_elements *= src_current_dim_len; - block_src_elements *= src_current_dim_len; - src_choose_idx--; - } else { - // 需要分割 - SplitDim split_dim = { - src_idx, // choose_idx - src_num_per_block_int, // num_per_block - src_num_per_grid, // num_per_grid - 0, // array_struct_idx_block (待更新) - 0, // array_struct_idx_grid (待更新) - src_current_dim_len // 原始维度长度 - }; - split_dims.push_back(split_dim); - } - } + + size_t num_per_block = + std::min(dim_len, (size_t)max_threads / block_elements); + + if (num_per_block > 0) { + size_t num_per_grid = + (dim_len + num_per_block - 1) / num_per_block; + + split_dims.push_back({ + dim_idx, + num_per_block, + num_per_grid, + 0, + 0, + dim_len + }); + + block_elements *= num_per_block; + chosen_block_dims++; // split 占 1 个 block 维度 + } + break; + } + } + - // 处理目标维度 - if (dst_num_per_block_int > 1) { - if (dst_num_per_grid == 1) { - // 可以完全放入块 - block_dim_choose[dst_idx] = true; - block_elements *= dst_current_dim_len; - block_dst_elements *= dst_current_dim_len; - dst_choose_idx--; - } else { - // 需要分割 - SplitDim split_dim = { - dst_idx, // choose_idx - dst_num_per_block_int, // num_per_block - dst_num_per_grid, // num_per_grid - 0, // array_struct_idx_block (待更新) - 0, // array_struct_idx_grid (待更新) - dst_current_dim_len // 原始维度长度 - }; - split_dims.push_back(split_dim); - } - } - break; - } + if (block_elements == 1 && ndim > 0) { + size_t dim_idx = 
dim_order[0]; + size_t dim_len = shape[dim_idx]; + + if (dim_len <= (size_t)max_threads) { + block_dim_choose[dim_idx] = true; + block_elements = dim_len; + } else { + // 需要分割 + size_t num_per_block = std::min(dim_len, (size_t)max_threads); + size_t num_per_grid = (dim_len + num_per_block - 1) / num_per_block; + + SplitDim split_dim = { + dim_idx, + num_per_block, + num_per_grid, + 0, + 0, + dim_len}; + split_dims.push_back(split_dim); + block_elements = num_per_block; } } // 准备block维度相关参数 size_t block_dim = 0; - size_t block_len_total = 1; + size_t block_len_total = block_elements; std::vector block_len; std::vector src_block_stride; @@ -286,7 +233,6 @@ utils::Result prepareRearrangeParams(const utils::RearrangeMeta src_block_stride.push_back(dims[i].src_stride); dst_block_stride.push_back(dims[i].dst_stride); block_dim += 1; - block_len_total *= shape[i]; } // 处理分割维度的block部分 @@ -295,9 +241,8 @@ utils::Result prepareRearrangeParams(const utils::RearrangeMeta block_len.push_back(split_dims[j].num_per_block); src_block_stride.push_back(dims[i].src_stride); dst_block_stride.push_back(dims[i].dst_stride); - split_dims[j].array_struct_idx_block = block_dim; + split_dims[j].array_struct_idx_block = static_cast(block_dim); block_dim += 1; - block_len_total *= split_dims[j].num_per_block; } } } @@ -314,7 +259,8 @@ utils::Result prepareRearrangeParams(const utils::RearrangeMeta grid_len.push_back(split_dims[j].num_per_grid); src_grid_stride.push_back(dims[i].src_stride * split_dims[j].num_per_block); dst_grid_stride.push_back(dims[i].dst_stride * split_dims[j].num_per_block); - split_dims[j].array_struct_idx_grid = grid_len.size() - 1; + split_dims[j].array_struct_idx_grid = static_cast(grid_len.size() - 1); + break; } } @@ -348,6 +294,10 @@ utils::Result prepareRearrangeParams(const utils::RearrangeMeta constraint.grid_div_block = split_dims[i].num_per_block; constraint.total_len = split_dims[i].dim_len; constraints.push_back(constraint); + + if (constraints.size() >= 2) 
{ + break; + } } // 设置参数 @@ -385,13 +335,13 @@ infiniStatus_t launchKernel( // 创建非const的临时变量 size_t block_dim = params.block_dim; size_t block_len_total = params.block_len_total; - + // 计算对齐后的线程块大小(Block Size)以适配 MUSA 架构的Warp特性 // - MUSA 架构以 32 线程为基本调度单位(1个 Warp) // - 通过向上取整到最近的 32 的倍数,确保线程块包含完整的 Warp // - MUSA 似乎不支持非 32 整数倍的计算 size_t aligned_block_size = ((block_len_total + 31) / 32) * 32; - block_len_total = aligned_block_size; + // block_len_total = aligned_block_size; // 确保对齐后的线程块大小不超过硬件/模板限制 if (aligned_block_size > BLOCK_SIZE) { @@ -434,7 +384,7 @@ infiniStatus_t launchKernel( // - 向上取整,数学等效:ceil(n / 32) * 32 CHECK_OR_RETURN(musaLaunchKernel( kernel_func, - grid_size, aligned_block_size, + static_cast(grid_size), static_cast(aligned_block_size), args, 0, stream) == musaSuccess, INFINI_STATUS_INTERNAL_ERROR); @@ -458,11 +408,6 @@ infiniStatus_t Descriptor::calculate( // 如果没有维度,直接进行内存拷贝 if (_meta.ndim() == 0) { - auto err = musaMemcpyAsync(y, x, _meta.unit(), musaMemcpyDeviceToDevice, musa_stream); - if (err != musaSuccess) { - return INFINI_STATUS_INTERNAL_ERROR; - } - CHECK_OR_RETURN(musaMemcpyAsync(y, x, _meta.unit(), musaMemcpyDeviceToDevice, musa_stream) == musaSuccess, INFINI_STATUS_INTERNAL_ERROR); return INFINI_STATUS_SUCCESS; @@ -497,7 +442,6 @@ infiniStatus_t Descriptor::calculate( } else if (block_size <= MOORE_BLOCK_SIZE_1024) { status = launchKernel(y, x, grid_size, params, _meta.unit(), musa_stream); } else { - std::cerr << "[ERROR] block_size=" << block_size << " exceeds max supported" << std::endl; return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; } From ee9f95146d0d111c9c39cc6d6f099a5ff256c749 Mon Sep 17 00:00:00 2001 From: zhushuang <974198603@qq.com> Date: Wed, 21 Jan 2026 14:20:04 +0800 Subject: [PATCH 2/2] issue/949 - feat: add silu_and_mul for moore gpu with test pass --- include/infinicore/ops.hpp | 1 + include/infinicore/ops/silu_and_mul.hpp | 18 ++ include/infiniop.h | 1 + include/infiniop/ops/silu_and_mul.h | 49 ++++++ 
python/infinicore/__init__.py | 1 + python/infinicore/nn/functional/__init__.py | 2 + .../infinicore/nn/functional/silu_and_mul.py | 19 ++ .../ops/silu_and_mul/silu_and_mul.cc | 43 +++++ .../ops/silu_and_mul/silu_and_mul_infiniop.cc | 60 +++++++ src/infinicore/pybind11/ops.hpp | 2 + src/infinicore/pybind11/ops/silu_and_mul.hpp | 31 ++++ src/infiniop/ops/silu_and_mul/info.h | 54 ++++++ .../silu_and_mul/moore/silu_and_mul_moore.h | 8 + .../silu_and_mul/moore/silu_and_mul_moore.mu | 123 +++++++++++++ src/infiniop/ops/silu_and_mul/operator.cc | 82 +++++++++ src/infiniop/ops/silu_and_mul/silu_and_mul.h | 46 +++++ test/infinicore/ops/silu_and_mul.py | 139 +++++++++++++++ test/infiniop/libinfiniop/op_register.py | 39 +++++ test/infiniop/silu_and_mul.py | 164 ++++++++++++++++++ 19 files changed, 882 insertions(+) create mode 100644 include/infinicore/ops/silu_and_mul.hpp create mode 100644 include/infiniop/ops/silu_and_mul.h create mode 100644 python/infinicore/nn/functional/silu_and_mul.py create mode 100644 src/infinicore/ops/silu_and_mul/silu_and_mul.cc create mode 100644 src/infinicore/ops/silu_and_mul/silu_and_mul_infiniop.cc create mode 100644 src/infinicore/pybind11/ops/silu_and_mul.hpp create mode 100644 src/infiniop/ops/silu_and_mul/info.h create mode 100644 src/infiniop/ops/silu_and_mul/moore/silu_and_mul_moore.h create mode 100644 src/infiniop/ops/silu_and_mul/moore/silu_and_mul_moore.mu create mode 100644 src/infiniop/ops/silu_and_mul/operator.cc create mode 100644 src/infiniop/ops/silu_and_mul/silu_and_mul.h create mode 100644 test/infinicore/ops/silu_and_mul.py create mode 100644 test/infiniop/silu_and_mul.py diff --git a/include/infinicore/ops.hpp b/include/infinicore/ops.hpp index a7249ec9d..2791c41fe 100644 --- a/include/infinicore/ops.hpp +++ b/include/infinicore/ops.hpp @@ -15,3 +15,4 @@ #include "ops/rope.hpp" #include "ops/silu.hpp" #include "ops/swiglu.hpp" +#include "ops/silu_and_mul.hpp" diff --git a/include/infinicore/ops/silu_and_mul.hpp 
b/include/infinicore/ops/silu_and_mul.hpp new file mode 100644 index 000000000..11a49d252 --- /dev/null +++ b/include/infinicore/ops/silu_and_mul.hpp @@ -0,0 +1,18 @@ +#pragma once + +#include "../device.hpp" +#include "../graph/graph.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +// 这个宏会自动定义 SiluAndMul 类,并包含: +// execute, dispatcher, plan_dispatcher, run_dispatcher, cleanup_dispatcher +// 以及对应的 schema 类型定义 +INFINICORE_GRAPH_OP_CLASS(SiluAndMul, Tensor, Tensor); + +// 全局辅助函数 +Tensor silu_and_mul(Tensor x); +void silu_and_mul_(Tensor out, Tensor x); + +} // namespace infinicore::op diff --git a/include/infiniop.h b/include/infiniop.h index c0a09fcb4..9c786d51e 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -34,6 +34,7 @@ #include "infiniop/ops/topkrouter.h" #include "infiniop/ops/topksoftmax.h" #include "infiniop/ops/zeros.h" +#include "infiniop/ops/silu_and_mul.h" #include "infiniop/tensor_descriptor.h" #endif // __INFINIOP_API_H__ diff --git a/include/infiniop/ops/silu_and_mul.h b/include/infiniop/ops/silu_and_mul.h new file mode 100644 index 000000000..e19675b2a --- /dev/null +++ b/include/infiniop/ops/silu_and_mul.h @@ -0,0 +1,49 @@ +#ifndef __INFINIOP_SILU_AND_MUL_API_H__ +#define __INFINIOP_SILU_AND_MUL_API_H__ + +#include "../operator_descriptor.h" + +// 定义描述符类型 +typedef struct InfiniopDescriptor *infiniopSiluAndMulDescriptor_t; + +/** + * @brief 创建 SiluAndMul 算子描述符 + * * 公式: output = silu(input_front) * input_back + * 其中 input 形状为 [..., 2*d], output 形状为 [..., d] + */ +__C __export infiniStatus_t infiniopCreateSiluAndMulDescriptor( + infiniopHandle_t handle, + infiniopSiluAndMulDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +/** + * @brief 获取算子执行所需的临时空间大小 + */ +__C __export infiniStatus_t infiniopGetSiluAndMulWorkspaceSize( + infiniopSiluAndMulDescriptor_t desc, + size_t *size); + +/** + * @brief 执行 SiluAndMul 计算 + * * @param workspace 临时空间指针 + * @param workspace_size 临时空间大小 + 
* @param output 输出张量数据指针 [..., d] + * @param input 输入张量数据指针 [..., 2*d] + * @param stream 硬件流指针 (如 musaStream_t) + */ +__C __export infiniStatus_t infiniopSiluAndMul( + infiniopSiluAndMulDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +/** + * @brief 销毁描述符并释放相关资源 + */ +__C __export infiniStatus_t infiniopDestroySiluAndMulDescriptor( + infiniopSiluAndMulDescriptor_t desc); + +#endif // __INFINIOP_SILU_AND_MUL_API_H__ diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py index c6b01d5aa..e0a1e34f9 100644 --- a/python/infinicore/__init__.py +++ b/python/infinicore/__init__.py @@ -3,6 +3,7 @@ import infinicore.context as context import infinicore.nn as nn + # Import context functions from infinicore.context import ( get_device, diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py index 255079790..9131d70a4 100644 --- a/python/infinicore/nn/functional/__init__.py +++ b/python/infinicore/nn/functional/__init__.py @@ -6,6 +6,7 @@ from .rope import RopeAlgo, rope from .silu import silu from .swiglu import swiglu +from .silu_and_mul import silu_and_mul __all__ = [ "causal_softmax", @@ -17,4 +18,5 @@ "embedding", "rope", "RopeAlgo", + "silu_and_mul", ] diff --git a/python/infinicore/nn/functional/silu_and_mul.py b/python/infinicore/nn/functional/silu_and_mul.py new file mode 100644 index 000000000..53fe66dfd --- /dev/null +++ b/python/infinicore/nn/functional/silu_and_mul.py @@ -0,0 +1,19 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def silu_and_mul(input: Tensor, out=None) -> Tensor: + r"""Apply the SiLU and Mul (SwiGLU) function. 
+ + Formula: output = SiLU(input_gate) * input_up + Input shape: [..., 2*d], Output shape: [..., d] + """ + + if out is None: + # 调用 C++ 非原地接口,内部处理输出 Tensor 的创建 + return Tensor(_infinicore.silu_and_mul(input._underlying)) + + # 调用 C++ 原地/指定输出接口 + _infinicore.silu_and_mul_(out._underlying, input._underlying) + + return out diff --git a/src/infinicore/ops/silu_and_mul/silu_and_mul.cc b/src/infinicore/ops/silu_and_mul/silu_and_mul.cc new file mode 100644 index 000000000..ed871e483 --- /dev/null +++ b/src/infinicore/ops/silu_and_mul/silu_and_mul.cc @@ -0,0 +1,43 @@ +#include "infinicore/ops/silu_and_mul.hpp" +#include "../../utils.hpp" + +namespace infinicore::op { + +// 实现分发器 +INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(SiluAndMul); + +// 构造函数:校验设备并分发 +SiluAndMul::SiluAndMul(Tensor out, Tensor x) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, x); + // 根据设备类型(如 Moore, Cuda 等)路由到具体的实现 + INFINICORE_GRAPH_OP_DISPATCH(out->device().getType(), out, x); +} + +// 执行接口:在图模式下记录或在即时模式下运行 +void SiluAndMul::execute(Tensor out, Tensor x) { + INFINICORE_GRAPH_OP_RECORD_OR_RUN(SiluAndMul, out, x); +} + +// 非原地接口:负责推导输出形状并分配内存 +Tensor silu_and_mul(Tensor x) { + Shape shape = x->shape(); + size_t ndim = x->ndim(); + + // SwiGLU 逻辑:输出最后一维是输入的一半 + if (shape[ndim - 1] % 2 != 0) { + throw std::runtime_error("SiluAndMul input last dim must be even."); + } + shape[ndim - 1] /= 2; + + // 创建输出张量 + auto out = Tensor::empty(shape, x->dtype(), x->device()); + silu_and_mul_(out, x); + return out; +} + +// 原地/指定输出接口 +void silu_and_mul_(Tensor out, Tensor x) { + SiluAndMul::execute(out, x); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/silu_and_mul/silu_and_mul_infiniop.cc b/src/infinicore/ops/silu_and_mul/silu_and_mul_infiniop.cc new file mode 100644 index 000000000..c141e127e --- /dev/null +++ b/src/infinicore/ops/silu_and_mul/silu_and_mul_infiniop.cc @@ -0,0 +1,60 @@ +#include "../infiniop_impl.hpp" +#include "infinicore/ops/silu_and_mul.hpp" + +namespace 
infinicore::op::silu_and_mul_impl::infiniop { + +// 定义可缓存的描述符,用于避免频繁创建/销毁 infiniopDescriptor +INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, SiluAndMul, 100); + +// 定义图执行模式所需的元数据 +struct PlannedMeta { + std::shared_ptr descriptor; + graph::GraphTensor workspace, output, input; +}; + +// 预执行阶段:创建描述符并关联张量 +void *plan(Tensor output, Tensor input) { + // 根据张量的描述符(形状、类型等)生成唯一 Hash Seed + size_t seed = hash_combine(output, input); + + // 获取缓存的描述符或创建新描述符 + INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE( + Descriptor, descriptor, SiluAndMul, + seed, output->desc(), input->desc()); + + // 分配工作空间张量(SwiGLU 如果需要的话,由 descriptor->workspace_size 决定) + INFINIOP_WORKSPACE_TENSOR(workspace, SiluAndMul, descriptor); + + auto planned = new PlannedMeta{ + descriptor, + graph::GraphTensor(workspace), + graph::GraphTensor(output), + graph::GraphTensor(input)}; + + return planned; +} + +// 实际执行阶段 +void run(void *planned_meta) { + auto planned = reinterpret_cast(planned_meta); + + // 调用我们在之前步骤中实现的 infiniop 接口 + INFINICORE_CHECK_ERROR(infiniopSiluAndMul( + planned->descriptor->desc, + planned->workspace->data(), + planned->workspace->numel(), + planned->output->data(), + planned->input->data(), + context::getStream())); +} + +// 清理逻辑 +void cleanup(void **planned_meta_ptr) { + delete *reinterpret_cast(planned_meta_ptr); + *planned_meta_ptr = nullptr; +} + +// 注册算子到所有支持的设备 +INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(SiluAndMul, &plan, &run, &cleanup); + +} // namespace infinicore::op::silu_and_mul_impl::infiniop diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index 3d6ebe79a..4340b5b8a 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -19,6 +19,7 @@ #include "ops/rope.hpp" #include "ops/silu.hpp" #include "ops/swiglu.hpp" +#include "ops/silu_and_mul.hpp" namespace py = pybind11; @@ -42,6 +43,7 @@ inline void bind(py::module &m) { bind_swiglu(m); bind_rope(m); bind_embedding(m); + bind_silu_and_mul(m); } } // namespace infinicore::ops diff 
--git a/src/infinicore/pybind11/ops/silu_and_mul.hpp b/src/infinicore/pybind11/ops/silu_and_mul.hpp new file mode 100644 index 000000000..379575868 --- /dev/null +++ b/src/infinicore/pybind11/ops/silu_and_mul.hpp @@ -0,0 +1,31 @@ +#pragma once + +#include + +#include "infinicore/ops/silu_and_mul.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_silu_and_mul(py::module &m) { + // 绑定非原地函数: Tensor silu_and_mul(Tensor input) + m.def("silu_and_mul", + &op::silu_and_mul, + py::arg("input"), + R"doc( + SiLU and Mul (SwiGLU) activation function. + Input should be [..., 2*d], output will be [..., d]. + )doc"); + + // 绑定原地/指定输出函数: void silu_and_mul_(Tensor output, Tensor input) + m.def("silu_and_mul_", + &op::silu_and_mul_, + py::arg("output"), + py::arg("input"), + R"doc( + In-place or destination-specified SiLU and Mul (SwiGLU) activation function. + )doc"); +} + +} // namespace infinicore::ops diff --git a/src/infiniop/ops/silu_and_mul/info.h b/src/infiniop/ops/silu_and_mul/info.h new file mode 100644 index 000000000..c7be1ddff --- /dev/null +++ b/src/infiniop/ops/silu_and_mul/info.h @@ -0,0 +1,54 @@ +#ifndef __SILU_AND_MUL_INFO_H__ +#define __SILU_AND_MUL_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include + +namespace op::silu_and_mul { + + class SiluAndMulInfo { + SiluAndMulInfo() = default; + + public: + infiniDtype_t dtype; + size_t batch_size; + size_t out_hidden_dim; + + static utils::Result create(infiniopTensorDescriptor_t y_desc, infiniopTensorDescriptor_t x_desc) { + auto dtype = y_desc->dtype(); + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + auto ndim = x_desc->ndim(); + + if (ndim != y_desc->ndim()) { + return INFINI_STATUS_BAD_PARAM; + } + + if (x_shape[ndim - 1] != 2 * y_shape[ndim - 1]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t batch = 1; + for (int i = 0; i < (int)ndim - 1; ++i) { + if (x_shape[i] != y_shape[i]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + 
} + batch *= y_shape[i]; + } + + return utils::Result(SiluAndMulInfo{ + dtype, + batch, + y_shape[ndim - 1]}); + } + + private: + SiluAndMulInfo(infiniDtype_t dtype, size_t batch, size_t hidden) + : dtype(dtype), batch_size(batch), out_hidden_dim(hidden) {} + }; + +} + +#endif // __SILU_AND_MUL_INFO_H__ diff --git a/src/infiniop/ops/silu_and_mul/moore/silu_and_mul_moore.h b/src/infiniop/ops/silu_and_mul/moore/silu_and_mul_moore.h new file mode 100644 index 000000000..2789e2de6 --- /dev/null +++ b/src/infiniop/ops/silu_and_mul/moore/silu_and_mul_moore.h @@ -0,0 +1,8 @@ +#ifndef __SILU_AND_MUL_MOORE_API_H__ +#define __SILU_AND_MUL_MOORE_API_H__ + +#include "../silu_and_mul.h" + +DESCRIPTOR(moore) + +#endif // __SILU_AND_MUL_MOORE_API_H__ diff --git a/src/infiniop/ops/silu_and_mul/moore/silu_and_mul_moore.mu b/src/infiniop/ops/silu_and_mul/moore/silu_and_mul_moore.mu new file mode 100644 index 000000000..48fcb9609 --- /dev/null +++ b/src/infiniop/ops/silu_and_mul/moore/silu_and_mul_moore.mu @@ -0,0 +1,123 @@ +#include "../../../devices/moore/moore_common.h" +#include "../../../devices/moore/moore_handle.h" +#include "silu_and_mul_moore.h" + +#include +#include + +namespace op::silu_and_mul::moore { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + + if (!desc_ptr) { + return INFINI_STATUS_BAD_PARAM; + } + + auto handle = reinterpret_cast(handle_); + auto dtype = y_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + if (x_desc->dtype() != dtype) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + auto result = SiluAndMulInfo::create(y_desc, x_desc); + CHECK_RESULT(result); + auto info = result.take(); + + *desc_ptr = new Descriptor( + new Opaque{handle->internal()}, + std::move(info), + 0, + 
handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculate_impl( + const SiluAndMulInfo &info, + std::shared_ptr &internal, + void *y, + const void *x, + void *stream) { + + return internal->useMudnn( + (musaStream_t)stream, + [&](::musa::dnn::Handle &mudnn_handle) -> infiniStatus_t { + + ::musa::dnn::Tensor x_t, y_t; + + if constexpr (std::is_same_v) { + x_t.SetType(::musa::dnn::Tensor::Type::HALF); + y_t.SetType(::musa::dnn::Tensor::Type::HALF); + } else if constexpr (std::is_same_v) { + x_t.SetType(::musa::dnn::Tensor::Type::BFLOAT16); + y_t.SetType(::musa::dnn::Tensor::Type::BFLOAT16); + } else { + x_t.SetType(::musa::dnn::Tensor::Type::FLOAT); + y_t.SetType(::musa::dnn::Tensor::Type::FLOAT); + } + + x_t.SetAddr(const_cast(x)); + y_t.SetAddr(y); + + // --- Construct 2D dimension information --- + // Explicitly distinguish between Batch and Hidden dimensions + int64_t b = static_cast(info.batch_size); + int64_t h = static_cast(info.out_hidden_dim); + + // Input x logical shape is [batch, 2 * hidden] + std::array x_dims = {b, h * 2}; + std::array x_strides = {h * 2, 1}; + + // Output y logical shape is [batch, hidden] + std::array y_dims = {b, h}; + std::array y_strides = {h, 1}; + + x_t.SetNdInfo(2, x_dims.data(), x_strides.data()); + y_t.SetNdInfo(2, y_dims.data(), y_strides.data()); + + // Invoke muDNN SwiGLU + // muDNN will split each row (length 2*h) internally, + // muDNN treats the first h elements of input x as the 'gate' + // and the following h elements as the 'up' projection. 
+ ::musa::dnn::SwiGlu swiglu; + swiglu.Run(mudnn_handle, y_t, x_t); + + return INFINI_STATUS_SUCCESS; + }); +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + void *y, const void *x, + void *stream) const { + + infiniDtype_t dtype = _info.dtype; + + switch (dtype) { + case INFINI_DTYPE_F16: + return calculate_impl(_info, _opaque->internal, y, x, stream); + case INFINI_DTYPE_F32: + return calculate_impl(_info, _opaque->internal, y, x, stream); + case INFINI_DTYPE_BF16: + return calculate_impl<__mt_bfloat16>(_info, _opaque->internal, y, x, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +} // namespace op::silu_and_mul::moore diff --git a/src/infiniop/ops/silu_and_mul/operator.cc b/src/infiniop/ops/silu_and_mul/operator.cc new file mode 100644 index 000000000..85f651607 --- /dev/null +++ b/src/infiniop/ops/silu_and_mul/operator.cc @@ -0,0 +1,82 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/silu_and_mul.h" + +#ifdef ENABLE_MOORE_API +#include "moore/silu_and_mul_moore.h" +#endif + +__C infiniStatus_t infiniopCreateSiluAndMulDescriptor( + infiniopHandle_t handle, + infiniopSiluAndMulDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::silu_and_mul::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + x_desc); + + switch (handle->device) { +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + + +__C infiniStatus_t infiniopGetSiluAndMulWorkspaceSize(infiniopSiluAndMulDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + } + return 
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + + +__C infiniStatus_t infiniopSiluAndMul( + infiniopSiluAndMulDescriptor_t desc, + void *workspace, size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, y, x, stream); + + switch (desc->device_type) { +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + + +__C infiniStatus_t infiniopDestroySiluAndMulDescriptor(infiniopSiluAndMulDescriptor_t desc) { + +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_MOORE_API + DESTROY(INFINI_DEVICE_MOORE, moore); +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} diff --git a/src/infiniop/ops/silu_and_mul/silu_and_mul.h b/src/infiniop/ops/silu_and_mul/silu_and_mul.h new file mode 100644 index 000000000..ced75e68d --- /dev/null +++ b/src/infiniop/ops/silu_and_mul/silu_and_mul.h @@ -0,0 +1,46 @@ +#ifndef SILU_AND_MUL_H +#define SILU_AND_MUL_H + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::silu_and_mul::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + SiluAndMulInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + SiluAndMulInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t y_desc, \ + 
# ============================================================================
# NOTE(review): this span was a whitespace-mangled paste of a git patch (all
# newlines collapsed).  The two units below are reconstructed with their line
# structure restored; Chinese comments are translated to English.  A fragment
# of the silu_and_mul.h DESCRIPTOR macro preceding this span belongs to the
# C++ header and is not reproduced here.
# ============================================================================

# --- test/infinicore/ops/silu_and_mul.py (new file) -------------------------
import sys
import os

sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

import torch
import infinicore
from framework import (
    BaseOperatorTest,
    TensorSpec,
    TestCase,
    GenericTestRunner,
    is_broadcast,
)

# ==============================================================================
# Operator-specific configuration
# ==============================================================================

# Test cases format: (input_shape, input_strides, output_strides)
# SiLUAndMul: Input (..., 2*d) -> Output (..., d)
_TEST_CASES_DATA = [
    # Basic 2D: [2, 4] -> [2, 2]
    ((2, 4), None, None),
    # 2D Large: [1024, 1024] -> [1024, 512]
    ((1024, 1024), None, None),
    # 3D: [2, 4, 8] -> [2, 4, 4]
    ((2, 4, 8), None, None),
    # LLM typical hidden size (e.g., Llama SwiGLU: intermediate_size=11008)
    # [1, 11008*2] -> [1, 11008]
    ((1, 22016), None, None),
    # Contiguous 3D case.
    # NOTE(review): the original comment said "Strided tensors", but no
    # strides are supplied here — confirm whether a strided case was intended.
    ((2, 4, 256), None, None),
]

# Tolerance configuration
_TOLERANCE_MAP = {
    infinicore.float16: {"atol": 1e-3, "rtol": 1e-3},
    infinicore.float32: {"atol": 1e-5, "rtol": 1e-5},
    infinicore.bfloat16: {"atol": 5e-3, "rtol": 1e-2},
}

# Data types to test
_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]


def parse_test_cases():
    """
    Parse SiLUAndMul test case data.

    Input: [..., 2*d], Output: [..., d]
    """
    test_cases = []

    for data in _TEST_CASES_DATA:
        input_shape = data[0]
        input_strides = data[1] if len(data) > 1 else None
        output_strides = data[2] if len(data) > 2 else None

        # Derive the output shape: the last dimension is halved.
        output_shape = list(input_shape)
        output_shape[-1] //= 2
        output_shape = tuple(output_shape)

        # SiLUAndMul cannot run in place on the input (shapes differ), but it
        # does support an explicit output tensor (out=output).
        output_supports_explicit_out = not is_broadcast(output_strides)

        for dtype in _TENSOR_DTYPES:
            tolerance = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4})

            input_spec = TensorSpec.from_tensor(input_shape, input_strides, dtype)
            output_spec = TensorSpec.from_tensor(output_shape, output_strides, dtype)

            # Test case 1: out-of-place (result returned).
            test_cases.append(
                TestCase(
                    inputs=[input_spec],
                    kwargs={},
                    output_spec=None,
                    comparison_target=None,
                    tolerance=tolerance,
                    description="SiLUAndMul - OUT_OF_PLACE",
                )
            )

            # Test case 2: explicit output tensor: silu_and_mul(input, out=output).
            # NOTE(review): kwargs=None here vs kwargs={} above is kept from
            # the original — confirm the framework treats them the same.
            if output_supports_explicit_out:
                test_cases.append(
                    TestCase(
                        inputs=[input_spec],
                        kwargs=None,
                        output_spec=output_spec,
                        comparison_target="out",
                        tolerance=tolerance,
                        description="SiLUAndMul - OUT_PARAM",
                    )
                )

    return test_cases


class OpTest(BaseOperatorTest):
    """SiLUAndMul operator test (SwiGLU)."""

    def __init__(self):
        super().__init__("SiLUAndMul")

    def get_test_cases(self):
        return parse_test_cases()

    def torch_operator(self, input, out=None, **kwargs):
        """PyTorch SwiGLU reference: SiLU(x_gate) * x_up."""
        d = input.shape[-1] // 2
        # Split the last dimension into the gate and up halves.
        gate, up = torch.split(input, [d, d], dim=-1)
        result = torch.nn.functional.silu(gate) * up

        if out is not None:
            out.copy_(result)
            return out
        return result

    def infinicore_operator(self, input, out=None, **kwargs):
        """InfiniCore SiLUAndMul implementation."""

        import infinicore.nn.functional as F

        return F.silu_and_mul(input, out=out)


def main():
    """Main entry point."""
    runner = GenericTestRunner(OpTest)
    runner.run_and_exit()


if __name__ == "__main__":
    main()


# --- test/infiniop/libinfiniop/op_register.py (appended registration) --------
# NOTE(review): item 4 physically sits just past this span in the mangled
# source; it is reconstructed here so the registration function is complete.
@OpRegister.operator
def silu_and_mul(lib):
    # 1. Register the descriptor-creation entry point.
    #    Args: handle, desc_ptr, output_desc, input_desc
    lib.infiniopCreateSiluAndMulDescriptor.restype = c_int32
    lib.infiniopCreateSiluAndMulDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]

    # 2. Register the workspace-size query.
    #    Args: desc, size_ptr
    lib.infiniopGetSiluAndMulWorkspaceSize.restype = c_int32
    lib.infiniopGetSiluAndMulWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]

    # 3. Register the compute entry point.
    #    Args: desc, workspace, workspace_size, output_ptr, input_ptr, stream
    lib.infiniopSiluAndMul.restype = c_int32
    lib.infiniopSiluAndMul.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_size_t,
        c_void_p,
        c_void_p,
        c_void_p,
    ]

    # 4. Register the descriptor-destruction entry point.
    #    Args: desc
    lib.infiniopDestroySiluAndMulDescriptor.restype = c_int32
    lib.infiniopDestroySiluAndMulDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
# ============================================================================
# NOTE(review): this span was a whitespace-mangled paste of a git patch (all
# newlines collapsed).  The unit below is test/infiniop/silu_and_mul.py
# reconstructed with its line structure restored.  The fragment at the start of
# this span (destroy-descriptor ctypes registration) belongs to
# op_register.py's silu_and_mul() and is not reproduced here.
# ============================================================================

# --- test/infiniop/silu_and_mul.py (new file) --------------------------------
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
    LIBINFINIOP,
    TestTensor,
    get_test_devices,
    check_error,
    test_operator,
    get_args,
    debug,
    get_tolerance,
    profile_operation,
    TestWorkspace,
    InfiniDtype,
    InfiniDtypeNames,
    InfiniDeviceNames,
    infiniopOperatorDescriptor_t,
)

# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# Format: (input_shape, output_shape)
# Referencing vLLM kernel Silu_and_Mul interface:
# input_shape is [..., 2*d], output_shape is [..., d]
_TEST_CASES = [
    # input_shape, output_shape
    ((2, 8), (2, 4)),
    ((1024, 1024), (1024, 512)),
    ((16, 8192), (16, 4096)),
    ((2, 128, 2048), (2, 128, 1024)),
    ((8, 1, 4096), (8, 1, 2048)),
    ((2, 4, 16, 256), (2, 4, 16, 128)),
]

_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]

_TOLERANCE_MAP = {
    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
    InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6},
    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
}

DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 100


# PyTorch reference: silu(gate) * up where [gate, up] = split(input)
def silu_and_mul_torch(out, input_tensor):
    """
    Computes the SwiGLU activation function: SiLU(gate) * up.
    """
    # Split the last dimension into two halves:
    # the first half is 'gate', the second is 'up'.
    d = input_tensor.shape[-1] // 2
    gate = input_tensor[..., :d]
    up = input_tensor[..., d:]

    # Apply SiLU to the gate and multiply by the up projection.
    torch.mul(torch.nn.functional.silu(gate), up, out=out)


# ==============================================================================
# Test Logic
# ==============================================================================
def test(
    handle,
    device,
    input_shape,
    output_shape,
    dtype=InfiniDtype.F16,
    sync=None,
):
    """Run one SiluAndMul case against the PyTorch reference."""
    print(
        f"Testing SiluAndMul on {InfiniDeviceNames[device]} with "
        f"input_shape:{input_shape} output_shape:{output_shape} dtype:{InfiniDtypeNames[dtype]}"
    )

    a = TestTensor(input_shape, None, dtype, device)
    c = TestTensor(output_shape, None, dtype, device, mode="zeros")
    ans = TestTensor(output_shape, None, dtype, device, mode="zeros")

    # Only contiguous tensors are supported by this operator.
    if not (
        a.torch_tensor().is_contiguous()
        and c.torch_tensor().is_contiguous()
        and ans.torch_tensor().is_contiguous()
    ):
        raise ValueError("This operator only supports contiguous memory layout.")

    # PyTorch answer reference.
    def torch_silu_and_mul_reference():
        silu_and_mul_torch(ans.torch_tensor(), a.torch_tensor())

    torch_silu_and_mul_reference()

    if sync is not None:
        sync()

    descriptor = infiniopOperatorDescriptor_t()
    check_error(
        LIBINFINIOP.infiniopCreateSiluAndMulDescriptor(
            handle,
            ctypes.byref(descriptor),
            c.descriptor,
            a.descriptor,
        )
    )

    # Invalidate the tensor descriptors once the operator descriptor holds
    # the shape information.
    # NOTE(review): `ans`'s descriptor is never destroyed here (as in the
    # original) — confirm whether TestTensor cleans it up itself.
    for tensor in [a, c]:
        tensor.destroy_desc()

    # Workspace
    workspace_size = c_uint64(0)
    check_error(
        LIBINFINIOP.infiniopGetSiluAndMulWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    workspace = TestWorkspace(workspace_size.value, device)

    def lib_op():
        check_error(
            LIBINFINIOP.infiniopSiluAndMul(
                descriptor,
                workspace.data(),
                workspace_size.value,
                c.data(),
                a.data(),
                None,
            )
        )

    lib_op()

    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)

    if DEBUG:
        debug(c.actual_tensor(), ans.torch_tensor(), atol=atol, rtol=rtol)

    assert torch.allclose(c.actual_tensor(), ans.torch_tensor(), atol=atol, rtol=rtol)

    # Profiling workflow
    if PROFILE:
        profile_operation("PyTorch", lambda: torch_silu_and_mul_reference(), device, NUM_PRERUN, NUM_ITERATIONS)
        profile_operation("    lib", lambda: lib_op(), device, NUM_PRERUN, NUM_ITERATIONS)

    check_error(LIBINFINIOP.infiniopDestroySiluAndMulDescriptor(descriptor))


# ==============================================================================
# Main Execution
# ==============================================================================
if __name__ == "__main__":
    args = get_args()

    DEBUG = args.debug
    PROFILE = args.profile
    NUM_PRERUN = args.num_prerun
    NUM_ITERATIONS = args.num_iterations

    for device in get_test_devices(args):
        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)

    print("\033[92mSiluAndMul Test passed!\033[0m")