From 7538ddf8d3213340fae189853966c7302cc1540a Mon Sep 17 00:00:00 2001 From: zhushuang <974198603@qq.com> Date: Tue, 13 Jan 2026 10:07:07 +0800 Subject: [PATCH 1/2] issue/899 - fix: fix causal_softmax and rearrange bug --- .../moore/causal_softmax_kernel.h | 2 +- .../ops/rearrange/moore/rearrange_kernel.h | 37 +-- .../ops/rearrange/moore/rearrange_moore.mu | 238 +++++++----------- 3 files changed, 102 insertions(+), 175 deletions(-) diff --git a/src/infiniop/ops/causal_softmax/moore/causal_softmax_kernel.h b/src/infiniop/ops/causal_softmax/moore/causal_softmax_kernel.h index 19d364552..111e7cd4c 100644 --- a/src/infiniop/ops/causal_softmax/moore/causal_softmax_kernel.h +++ b/src/infiniop/ops/causal_softmax/moore/causal_softmax_kernel.h @@ -28,7 +28,7 @@ __device__ void causalSoftmaxKernel( // 1 | * * * ... * * | // 2 | * * * ... * * * | // height: 3 col_id-> - if (width + blockIdx.x >= threadIdx.x + height) { + if (width + blockIdx.x >= col + height) { if constexpr (std::is_same_v || std::is_same_v) { /* * MUSA does not support CUDA's native `hexp` function. 
diff --git a/src/infiniop/ops/rearrange/moore/rearrange_kernel.h b/src/infiniop/ops/rearrange/moore/rearrange_kernel.h index b62bd507c..b11084015 100644 --- a/src/infiniop/ops/rearrange/moore/rearrange_kernel.h +++ b/src/infiniop/ops/rearrange/moore/rearrange_kernel.h @@ -31,7 +31,7 @@ struct Constraint { // 定义宏生成内核函数 #define DEFINE_REARRANGE_KERNEL(Tmem_type, constraint_num, block_array_size, grid_array_size) \ - extern "C" __global__ void rearrange_unit_##Tmem_type##_block_##block_array_size##_grid_##grid_array_size##_constrain_##constraint_num( \ + extern "C" INFINIOP_MOORE_KERNEL rearrange_unit_##Tmem_type##_block_##block_array_size##_grid_##grid_array_size##_constrain_##constraint_num( \ void *__restrict__ dst, \ const void *__restrict__ src, \ const size_t block_dim, \ @@ -64,13 +64,13 @@ struct Constraint { size_t remaining \ = blockIdx.x; \ \ - for (ssize_t i = grid_array_size - 1; i >= 0; i--) { \ + for (ptrdiff_t i = grid_array_size - 1; i >= 0; i--) { \ size_t idx = remaining % grid_len.a[i]; \ remaining /= grid_len.a[i]; \ src_offset += idx * src_grid_stride.a[i]; \ dst_offset += idx * dst_grid_stride.a[i]; \ if (constraint_num > 0) { \ - for (ssize_t j = 0; j < constraint_num; j++) { \ + for (ptrdiff_t j = 0; j < constraint_num; j++) { \ if (i == constraints.a[j].grid_idx) { \ constraints_grid_idx_multiple[j] = idx * constraints.a[j].grid_div_block; \ } \ @@ -81,7 +81,7 @@ struct Constraint { /* 将结果存入共享内存 */ \ shared_src_offset = src_offset; \ shared_dst_offset = dst_offset; \ - for (ssize_t j = 0; j < constraint_num; j++) { \ + for (ptrdiff_t j = 0; j < constraint_num; j++) { \ shared_constraints_grid_idx_multiple[j] = constraints_grid_idx_multiple[j]; \ } \ } \ @@ -93,18 +93,18 @@ struct Constraint { ptrdiff_t src_offset = shared_src_offset; \ ptrdiff_t dst_offset = shared_dst_offset; \ ARRAY_TYPE_SIZE constraints_grid_idx_multiple[constraint_num > 0 ? 
constraint_num : 1]; \ - for (ssize_t j = 0; j < constraint_num; j++) { \ + for (ptrdiff_t j = 0; j < constraint_num; j++) { \ constraints_grid_idx_multiple[j] = shared_constraints_grid_idx_multiple[j]; \ } \ \ - for (ssize_t i = block_array_size - 1; i >= 0; i--) { \ + for (ptrdiff_t i = block_array_size - 1; i >= 0; i--) { \ size_t idx = remaining % block_len.a[i]; \ remaining /= block_len.a[i]; \ /* 计算偏移量 */ \ src_offset += idx * src_block_stride.a[i]; \ dst_offset += idx * dst_block_stride.a[i]; \ if (constraint_num > 0) { \ - for (ssize_t j = 0; j < constraint_num; j++) { \ + for (ptrdiff_t j = 0; j < constraint_num; j++) { \ if (i == constraints.a[j].block_idx) { \ if (constraints_grid_idx_multiple[j] + idx >= constraints.a[j].total_len) { \ return; \ @@ -116,7 +116,7 @@ struct Constraint { \ src_offset += remaining * src_block_stride.a[0]; \ dst_offset += remaining * dst_block_stride.a[0]; \ - for (ssize_t j = 0; j < constraint_num; j++) { \ + for (ptrdiff_t j = 0; j < constraint_num; j++) { \ if (0 == constraints.a[j].block_idx) { \ if (constraints_grid_idx_multiple[j] + remaining >= constraints.a[j].total_len) { \ return; \ @@ -134,7 +134,7 @@ struct Constraint { ptrdiff_t dst_offset = 0; \ size_t remaining = blockIdx.x; \ \ - for (ssize_t i = grid_array_size - 1; i >= 0; i--) { \ + for (ptrdiff_t i = grid_array_size - 1; i >= 0; i--) { \ size_t idx = remaining % grid_len.a[i]; \ remaining /= grid_len.a[i]; \ src_offset += idx * src_grid_stride.a[i]; \ @@ -153,7 +153,7 @@ struct Constraint { ptrdiff_t src_offset = shared_src_offset; \ ptrdiff_t dst_offset = shared_dst_offset; \ \ - for (ssize_t i = block_array_size - 1; i > 0; i--) { \ + for (ptrdiff_t i = block_array_size - 1; i > 0; i--) { \ size_t idx = remaining % block_len.a[i]; \ remaining /= block_len.a[i]; \ /* 计算偏移量 */ \ @@ -234,25 +234,8 @@ utils::Result getRearrangeKernel(const RearrangeParams ¶ms) { CHECK_OR_RETURN(grid_num <= MAX_GRID_ARRAY_SIZE && grid_num != 0, INFINI_STATUS_BAD_PARAM); 
CHECK_OR_RETURN(block_num <= MAX_BLOCK_ARRAY_SIZE && block_num != 0, INFINI_STATUS_BAD_PARAM); - CHECK_OR_RETURN(constraint_num <= 2, INFINI_STATUS_BAD_PARAM); - /* - * These variables were originally part of the CUDA implementation for this kernel. - * They have been commented out because they are not currently used in the MUSA kernel logic. - * - * This change resolves "unused variable" warnings during compilation, ensuring a clean build. - * The original declarations are preserved here for for MUSA/CUDA platform alignment. - */ - - // auto block_len = params.block_len.data(); - // auto src_block_stride = params.src_block_stride.data(); - // auto dst_block_stride = params.dst_block_stride.data(); - // auto grid_len = params.grid_len.data(); - // auto src_grid_stride = params.src_grid_stride.data(); - // auto dst_grid_stride = params.dst_grid_stride.data(); - // auto constrain = params.constraints.data(); - void *kernel_func = nullptr; #define GET_REARRANGE_KERNEL(Tmem_type, block_array_size, grid_array_size, constraint_num) \ kernel_func = (void *)rearrange_unit_##Tmem_type##_block_##block_array_size##_grid_##grid_array_size##_constrain_##constraint_num; diff --git a/src/infiniop/ops/rearrange/moore/rearrange_moore.mu b/src/infiniop/ops/rearrange/moore/rearrange_moore.mu index b4c2de390..0ccdc2568 100644 --- a/src/infiniop/ops/rearrange/moore/rearrange_moore.mu +++ b/src/infiniop/ops/rearrange/moore/rearrange_moore.mu @@ -72,7 +72,7 @@ struct SplitDim { /** * 根据给定的元数据准备张量重排参数,该函数主要完成以下工作: * 1. 根据原始元数据调整单元大小,获取更适合GPU处理的单元大小 - * 2. 将维度分配为块(block)维度和网格(grid)维度: + * 2. 
将维度分配为 MUSA 块(block)维度和网格(grid)维度: * 该步骤是核心,目标是为每个block分配尽可能多的相对连续的数据进行处理, * 对无法完整放入块的维度进行分割,并记录分割维度信息,用于防止kernel访问越界,最大化内存访问局部性和计算效率 */ @@ -81,9 +81,7 @@ utils::Result prepareRearrangeParams(const utils::RearrangeMeta // 获取更适合GPU处理的单元大小,这里使用2的幂次方 auto meta_result = original_meta.distributeUnit({32, 16, 8, 4, 2, 1}); - CHECK_RESULT(meta_result); - const utils::RearrangeMeta &meta = meta_result.take(); // 获取维度信息 @@ -123,153 +121,102 @@ utils::Result prepareRearrangeParams(const utils::RearrangeMeta prev_idx_stride = idx_strides[i]; } - // 计算src_strides的降序排序索引,类似于Rust版本中的src_strides_desc_idx - std::vector src_strides_desc_idx(ndim); + std::vector block_dim_choose(ndim, false); + std::vector split_dims; + + // // 初始化计数器 + // size_t block_elements = 1; + + std::vector dim_order(ndim); for (size_t i = 0; i < ndim; ++i) { - src_strides_desc_idx[i] = i; + dim_order[i] = i; } - std::sort(src_strides_desc_idx.begin(), src_strides_desc_idx.end(), + + // 按src_stride升序排序,贪心选择 + std::sort(dim_order.begin(), dim_order.end(), [&dims](size_t a, size_t b) { - return std::abs(dims[a].src_stride) > std::abs(dims[b].src_stride); + return std::abs(dims[a].src_stride) < std::abs(dims[b].src_stride); }); - // 根据最大线程数选择block和grid维度 - const size_t block_size = max_threads; - std::vector block_dim_choose(ndim, false); + constexpr size_t MAX_BLOCK_DIM = MAX_BLOCK_ARRAY_SIZE; - // 初始化计数器 size_t block_elements = 1; - size_t block_src_elements = 1; - size_t block_dst_elements = 1; - size_t src_choose_idx = ndim; - size_t dst_choose_idx = ndim; - - // 用于存储分割维度信息 - std::vector split_dims; + size_t chosen_block_dims = 0; - // 维度选择循环 - while (src_choose_idx > 0 && dst_choose_idx > 0) { - // 获取当前需要处理的维度索引 - size_t src_idx = src_strides_desc_idx[src_choose_idx - 1]; - size_t dst_idx = dst_choose_idx - 1; - - if (src_idx == dst_idx) { - // 源和目标维度相同,可以一起处理 - size_t idx = src_idx; - size_t len = shape[idx]; - - // 检查是否可以将此维度完全添加到block中 - if (block_elements * len <= block_size) { - // 选择此维度 - 
block_dim_choose[idx] = true; - block_elements *= len; - block_src_elements *= len; - block_dst_elements *= len; - src_choose_idx--; - dst_choose_idx--; - } else { - // 需要分割此维度 - size_t num_per_block = block_size / block_elements; - - // 确保num_per_block > 0且len >= num_per_block - if (num_per_block > 0 && len >= num_per_block && num_per_block > 1) { - size_t num_per_grid = (len + num_per_block - 1) / num_per_block; // 向上取整 - - SplitDim split_dim = { - idx, // choose_idx - num_per_block, // num_per_block - num_per_grid, // num_per_grid - 0, // array_struct_idx_block (待更新) - 0, // array_struct_idx_grid (待更新) - len // 原始维度长度 - }; - split_dims.push_back(split_dim); - } - break; + for (size_t i = 0; i < ndim; ++i) { + size_t dim_idx = dim_order[i]; + size_t dim_len = shape[dim_idx]; + + // 1️⃣ 尝试完整放入 block + if (chosen_block_dims < MAX_BLOCK_DIM && + block_elements * dim_len <= (size_t)max_threads) { + + block_dim_choose[dim_idx] = true; + block_elements *= dim_len; + chosen_block_dims++; + continue; + } + + // 2️⃣ 尝试 split(⚠️ split 也会消耗 1 个 block dim) + if (block_elements > 1 && dim_len > 1) { + + // ⚠️ 关键:先检查 block_dim 上限 + if (chosen_block_dims + 1 > MAX_BLOCK_DIM) { + break; // 不能再加 block 维度了 } - } else { - // 源和目标维度不同,需要分别处理 - // 计算块比例 - double src_div_dst = static_cast(block_src_elements) / block_dst_elements; - double src_num_per_block = std::sqrt(block_size / (double)block_elements / src_div_dst); - double dst_num_per_block = src_num_per_block * src_div_dst; - - size_t src_current_dim_len = shape[src_idx]; - size_t dst_current_dim_len = shape[dst_idx]; - - if (static_cast(src_current_dim_len) < src_num_per_block) { - // 源维度可以完全添加到block - block_dim_choose[src_idx] = true; - block_elements *= src_current_dim_len; - block_src_elements *= src_current_dim_len; - src_choose_idx--; - } else if (static_cast(dst_current_dim_len) < dst_num_per_block) { - // 目标维度可以完全添加到block - block_dim_choose[dst_idx] = true; - block_elements *= dst_current_dim_len; - block_dst_elements *= 
dst_current_dim_len; - dst_choose_idx--; - } else { - // 需要分割源和目标维度 - size_t src_num_per_block_int = static_cast(std::floor(src_num_per_block)); - size_t dst_num_per_block_int = static_cast(std::floor(dst_num_per_block)); - - // 计算网格尺寸 - size_t src_num_per_grid = (src_current_dim_len + src_num_per_block_int - 1) / src_num_per_block_int; // 向上取整 - size_t dst_num_per_grid = (dst_current_dim_len + dst_num_per_block_int - 1) / dst_num_per_block_int; // 向上取整 - - // 处理源维度 - if (src_num_per_block_int > 1) { - if (src_num_per_grid == 1) { - // 可以完全放入块 - block_dim_choose[src_idx] = true; - block_elements *= src_current_dim_len; - block_src_elements *= src_current_dim_len; - src_choose_idx--; - } else { - // 需要分割 - SplitDim split_dim = { - src_idx, // choose_idx - src_num_per_block_int, // num_per_block - src_num_per_grid, // num_per_grid - 0, // array_struct_idx_block (待更新) - 0, // array_struct_idx_grid (待更新) - src_current_dim_len // 原始维度长度 - }; - split_dims.push_back(split_dim); - } - } + + size_t num_per_block = + std::min(dim_len, (size_t)max_threads / block_elements); + + if (num_per_block > 0) { + size_t num_per_grid = + (dim_len + num_per_block - 1) / num_per_block; + + split_dims.push_back({ + dim_idx, + num_per_block, + num_per_grid, + 0, + 0, + dim_len + }); + + block_elements *= num_per_block; + chosen_block_dims++; // split 占 1 个 block 维度 + } + break; + } + } + - // 处理目标维度 - if (dst_num_per_block_int > 1) { - if (dst_num_per_grid == 1) { - // 可以完全放入块 - block_dim_choose[dst_idx] = true; - block_elements *= dst_current_dim_len; - block_dst_elements *= dst_current_dim_len; - dst_choose_idx--; - } else { - // 需要分割 - SplitDim split_dim = { - dst_idx, // choose_idx - dst_num_per_block_int, // num_per_block - dst_num_per_grid, // num_per_grid - 0, // array_struct_idx_block (待更新) - 0, // array_struct_idx_grid (待更新) - dst_current_dim_len // 原始维度长度 - }; - split_dims.push_back(split_dim); - } - } - break; - } + if (block_elements == 1 && ndim > 0) { + size_t dim_idx = 
dim_order[0]; + size_t dim_len = shape[dim_idx]; + + if (dim_len <= (size_t)max_threads) { + block_dim_choose[dim_idx] = true; + block_elements = dim_len; + } else { + // 需要分割 + size_t num_per_block = std::min(dim_len, (size_t)max_threads); + size_t num_per_grid = (dim_len + num_per_block - 1) / num_per_block; + + SplitDim split_dim = { + dim_idx, + num_per_block, + num_per_grid, + 0, + 0, + dim_len}; + split_dims.push_back(split_dim); + block_elements = num_per_block; } } // 准备block维度相关参数 size_t block_dim = 0; - size_t block_len_total = 1; + size_t block_len_total = block_elements; std::vector block_len; std::vector src_block_stride; @@ -286,7 +233,6 @@ utils::Result prepareRearrangeParams(const utils::RearrangeMeta src_block_stride.push_back(dims[i].src_stride); dst_block_stride.push_back(dims[i].dst_stride); block_dim += 1; - block_len_total *= shape[i]; } // 处理分割维度的block部分 @@ -295,9 +241,8 @@ utils::Result prepareRearrangeParams(const utils::RearrangeMeta block_len.push_back(split_dims[j].num_per_block); src_block_stride.push_back(dims[i].src_stride); dst_block_stride.push_back(dims[i].dst_stride); - split_dims[j].array_struct_idx_block = block_dim; + split_dims[j].array_struct_idx_block = static_cast(block_dim); block_dim += 1; - block_len_total *= split_dims[j].num_per_block; } } } @@ -314,7 +259,8 @@ utils::Result prepareRearrangeParams(const utils::RearrangeMeta grid_len.push_back(split_dims[j].num_per_grid); src_grid_stride.push_back(dims[i].src_stride * split_dims[j].num_per_block); dst_grid_stride.push_back(dims[i].dst_stride * split_dims[j].num_per_block); - split_dims[j].array_struct_idx_grid = grid_len.size() - 1; + split_dims[j].array_struct_idx_grid = static_cast(grid_len.size() - 1); + break; } } @@ -348,6 +294,10 @@ utils::Result prepareRearrangeParams(const utils::RearrangeMeta constraint.grid_div_block = split_dims[i].num_per_block; constraint.total_len = split_dims[i].dim_len; constraints.push_back(constraint); + + if (constraints.size() >= 2) 
{ + break; + } } // 设置参数 @@ -385,13 +335,13 @@ infiniStatus_t launchKernel( // 创建非const的临时变量 size_t block_dim = params.block_dim; size_t block_len_total = params.block_len_total; - + // 计算对齐后的线程块大小(Block Size)以适配 MUSA 架构的Warp特性 // - MUSA 架构以 32 线程为基本调度单位(1个 Warp) // - 通过向上取整到最近的 32 的倍数,确保线程块包含完整的 Warp // - MUSA 似乎不支持非 32 整数倍的计算 size_t aligned_block_size = ((block_len_total + 31) / 32) * 32; - block_len_total = aligned_block_size; + // block_len_total = aligned_block_size; // 确保对齐后的线程块大小不超过硬件/模板限制 if (aligned_block_size > BLOCK_SIZE) { @@ -434,7 +384,7 @@ infiniStatus_t launchKernel( // - 向上取整,数学等效:ceil(n / 32) * 32 CHECK_OR_RETURN(musaLaunchKernel( kernel_func, - grid_size, aligned_block_size, + static_cast(grid_size), static_cast(aligned_block_size), args, 0, stream) == musaSuccess, INFINI_STATUS_INTERNAL_ERROR); @@ -458,11 +408,6 @@ infiniStatus_t Descriptor::calculate( // 如果没有维度,直接进行内存拷贝 if (_meta.ndim() == 0) { - auto err = musaMemcpyAsync(y, x, _meta.unit(), musaMemcpyDeviceToDevice, musa_stream); - if (err != musaSuccess) { - return INFINI_STATUS_INTERNAL_ERROR; - } - CHECK_OR_RETURN(musaMemcpyAsync(y, x, _meta.unit(), musaMemcpyDeviceToDevice, musa_stream) == musaSuccess, INFINI_STATUS_INTERNAL_ERROR); return INFINI_STATUS_SUCCESS; @@ -497,7 +442,6 @@ infiniStatus_t Descriptor::calculate( } else if (block_size <= MOORE_BLOCK_SIZE_1024) { status = launchKernel(y, x, grid_size, params, _meta.unit(), musa_stream); } else { - std::cerr << "[ERROR] block_size=" << block_size << " exceeds max supported" << std::endl; return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; } From ee9f95146d0d111c9c39cc6d6f099a5ff256c749 Mon Sep 17 00:00:00 2001 From: zhushuang <974198603@qq.com> Date: Wed, 21 Jan 2026 14:20:04 +0800 Subject: [PATCH 2/2] issue/949 - feat: add silu_and_mul for moore gpu with test pass --- include/infinicore/ops.hpp | 1 + include/infinicore/ops/silu_and_mul.hpp | 18 ++ include/infiniop.h | 1 + include/infiniop/ops/silu_and_mul.h | 49 ++++++ 
python/infinicore/__init__.py | 1 + python/infinicore/nn/functional/__init__.py | 2 + .../infinicore/nn/functional/silu_and_mul.py | 19 ++ .../ops/silu_and_mul/silu_and_mul.cc | 43 +++++ .../ops/silu_and_mul/silu_and_mul_infiniop.cc | 60 +++++++ src/infinicore/pybind11/ops.hpp | 2 + src/infinicore/pybind11/ops/silu_and_mul.hpp | 31 ++++ src/infiniop/ops/silu_and_mul/info.h | 54 ++++++ .../silu_and_mul/moore/silu_and_mul_moore.h | 8 + .../silu_and_mul/moore/silu_and_mul_moore.mu | 123 +++++++++++++ src/infiniop/ops/silu_and_mul/operator.cc | 82 +++++++++ src/infiniop/ops/silu_and_mul/silu_and_mul.h | 46 +++++ test/infinicore/ops/silu_and_mul.py | 139 +++++++++++++++ test/infiniop/libinfiniop/op_register.py | 39 +++++ test/infiniop/silu_and_mul.py | 164 ++++++++++++++++++ 19 files changed, 882 insertions(+) create mode 100644 include/infinicore/ops/silu_and_mul.hpp create mode 100644 include/infiniop/ops/silu_and_mul.h create mode 100644 python/infinicore/nn/functional/silu_and_mul.py create mode 100644 src/infinicore/ops/silu_and_mul/silu_and_mul.cc create mode 100644 src/infinicore/ops/silu_and_mul/silu_and_mul_infiniop.cc create mode 100644 src/infinicore/pybind11/ops/silu_and_mul.hpp create mode 100644 src/infiniop/ops/silu_and_mul/info.h create mode 100644 src/infiniop/ops/silu_and_mul/moore/silu_and_mul_moore.h create mode 100644 src/infiniop/ops/silu_and_mul/moore/silu_and_mul_moore.mu create mode 100644 src/infiniop/ops/silu_and_mul/operator.cc create mode 100644 src/infiniop/ops/silu_and_mul/silu_and_mul.h create mode 100644 test/infinicore/ops/silu_and_mul.py create mode 100644 test/infiniop/silu_and_mul.py diff --git a/include/infinicore/ops.hpp b/include/infinicore/ops.hpp index a7249ec9d..2791c41fe 100644 --- a/include/infinicore/ops.hpp +++ b/include/infinicore/ops.hpp @@ -15,3 +15,4 @@ #include "ops/rope.hpp" #include "ops/silu.hpp" #include "ops/swiglu.hpp" +#include "ops/silu_and_mul.hpp" diff --git a/include/infinicore/ops/silu_and_mul.hpp 
b/include/infinicore/ops/silu_and_mul.hpp new file mode 100644 index 000000000..11a49d252 --- /dev/null +++ b/include/infinicore/ops/silu_and_mul.hpp @@ -0,0 +1,18 @@ +#pragma once + +#include "../device.hpp" +#include "../graph/graph.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +// 这个宏会自动定义 SiluAndMul 类,并包含: +// execute, dispatcher, plan_dispatcher, run_dispatcher, cleanup_dispatcher +// 以及对应的 schema 类型定义 +INFINICORE_GRAPH_OP_CLASS(SiluAndMul, Tensor, Tensor); + +// 全局辅助函数 +Tensor silu_and_mul(Tensor x); +void silu_and_mul_(Tensor out, Tensor x); + +} // namespace infinicore::op diff --git a/include/infiniop.h b/include/infiniop.h index c0a09fcb4..9c786d51e 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -34,6 +34,7 @@ #include "infiniop/ops/topkrouter.h" #include "infiniop/ops/topksoftmax.h" #include "infiniop/ops/zeros.h" +#include "infiniop/ops/silu_and_mul.h" #include "infiniop/tensor_descriptor.h" #endif // __INFINIOP_API_H__ diff --git a/include/infiniop/ops/silu_and_mul.h b/include/infiniop/ops/silu_and_mul.h new file mode 100644 index 000000000..e19675b2a --- /dev/null +++ b/include/infiniop/ops/silu_and_mul.h @@ -0,0 +1,49 @@ +#ifndef __INFINIOP_SILU_AND_MUL_API_H__ +#define __INFINIOP_SILU_AND_MUL_API_H__ + +#include "../operator_descriptor.h" + +// 定义描述符类型 +typedef struct InfiniopDescriptor *infiniopSiluAndMulDescriptor_t; + +/** + * @brief 创建 SiluAndMul 算子描述符 + * * 公式: output = silu(input_front) * input_back + * 其中 input 形状为 [..., 2*d], output 形状为 [..., d] + */ +__C __export infiniStatus_t infiniopCreateSiluAndMulDescriptor( + infiniopHandle_t handle, + infiniopSiluAndMulDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +/** + * @brief 获取算子执行所需的临时空间大小 + */ +__C __export infiniStatus_t infiniopGetSiluAndMulWorkspaceSize( + infiniopSiluAndMulDescriptor_t desc, + size_t *size); + +/** + * @brief 执行 SiluAndMul 计算 + * * @param workspace 临时空间指针 + * @param workspace_size 临时空间大小 + 
* @param output 输出张量数据指针 [..., d] + * @param input 输入张量数据指针 [..., 2*d] + * @param stream 硬件流指针 (如 musaStream_t) + */ +__C __export infiniStatus_t infiniopSiluAndMul( + infiniopSiluAndMulDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +/** + * @brief 销毁描述符并释放相关资源 + */ +__C __export infiniStatus_t infiniopDestroySiluAndMulDescriptor( + infiniopSiluAndMulDescriptor_t desc); + +#endif // __INFINIOP_SILU_AND_MUL_API_H__ diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py index c6b01d5aa..e0a1e34f9 100644 --- a/python/infinicore/__init__.py +++ b/python/infinicore/__init__.py @@ -3,6 +3,7 @@ import infinicore.context as context import infinicore.nn as nn + # Import context functions from infinicore.context import ( get_device, diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py index 255079790..9131d70a4 100644 --- a/python/infinicore/nn/functional/__init__.py +++ b/python/infinicore/nn/functional/__init__.py @@ -6,6 +6,7 @@ from .rope import RopeAlgo, rope from .silu import silu from .swiglu import swiglu +from .silu_and_mul import silu_and_mul __all__ = [ "causal_softmax", @@ -17,4 +18,5 @@ "embedding", "rope", "RopeAlgo", + "silu_and_mul", ] diff --git a/python/infinicore/nn/functional/silu_and_mul.py b/python/infinicore/nn/functional/silu_and_mul.py new file mode 100644 index 000000000..53fe66dfd --- /dev/null +++ b/python/infinicore/nn/functional/silu_and_mul.py @@ -0,0 +1,19 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def silu_and_mul(input: Tensor, out=None) -> Tensor: + r"""Apply the SiLU and Mul (SwiGLU) function. 
+ + Formula: output = SiLU(input_gate) * input_up + Input shape: [..., 2*d], Output shape: [..., d] + """ + + if out is None: + # 调用 C++ 非原地接口,内部处理输出 Tensor 的创建 + return Tensor(_infinicore.silu_and_mul(input._underlying)) + + # 调用 C++ 原地/指定输出接口 + _infinicore.silu_and_mul_(out._underlying, input._underlying) + + return out diff --git a/src/infinicore/ops/silu_and_mul/silu_and_mul.cc b/src/infinicore/ops/silu_and_mul/silu_and_mul.cc new file mode 100644 index 000000000..ed871e483 --- /dev/null +++ b/src/infinicore/ops/silu_and_mul/silu_and_mul.cc @@ -0,0 +1,43 @@ +#include "infinicore/ops/silu_and_mul.hpp" +#include "../../utils.hpp" + +namespace infinicore::op { + +// 实现分发器 +INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(SiluAndMul); + +// 构造函数:校验设备并分发 +SiluAndMul::SiluAndMul(Tensor out, Tensor x) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, x); + // 根据设备类型(如 Moore, Cuda 等)路由到具体的实现 + INFINICORE_GRAPH_OP_DISPATCH(out->device().getType(), out, x); +} + +// 执行接口:在图模式下记录或在即时模式下运行 +void SiluAndMul::execute(Tensor out, Tensor x) { + INFINICORE_GRAPH_OP_RECORD_OR_RUN(SiluAndMul, out, x); +} + +// 非原地接口:负责推导输出形状并分配内存 +Tensor silu_and_mul(Tensor x) { + Shape shape = x->shape(); + size_t ndim = x->ndim(); + + // SwiGLU 逻辑:输出最后一维是输入的一半 + if (shape[ndim - 1] % 2 != 0) { + throw std::runtime_error("SiluAndMul input last dim must be even."); + } + shape[ndim - 1] /= 2; + + // 创建输出张量 + auto out = Tensor::empty(shape, x->dtype(), x->device()); + silu_and_mul_(out, x); + return out; +} + +// 原地/指定输出接口 +void silu_and_mul_(Tensor out, Tensor x) { + SiluAndMul::execute(out, x); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/silu_and_mul/silu_and_mul_infiniop.cc b/src/infinicore/ops/silu_and_mul/silu_and_mul_infiniop.cc new file mode 100644 index 000000000..c141e127e --- /dev/null +++ b/src/infinicore/ops/silu_and_mul/silu_and_mul_infiniop.cc @@ -0,0 +1,60 @@ +#include "../infiniop_impl.hpp" +#include "infinicore/ops/silu_and_mul.hpp" + +namespace 
infinicore::op::silu_and_mul_impl::infiniop { + +// 定义可缓存的描述符,用于避免频繁创建/销毁 infiniopDescriptor +INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, SiluAndMul, 100); + +// 定义图执行模式所需的元数据 +struct PlannedMeta { + std::shared_ptr descriptor; + graph::GraphTensor workspace, output, input; +}; + +// 预执行阶段:创建描述符并关联张量 +void *plan(Tensor output, Tensor input) { + // 根据张量的描述符(形状、类型等)生成唯一 Hash Seed + size_t seed = hash_combine(output, input); + + // 获取缓存的描述符或创建新描述符 + INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE( + Descriptor, descriptor, SiluAndMul, + seed, output->desc(), input->desc()); + + // 分配工作空间张量(SwiGLU 如果需要的话,由 descriptor->workspace_size 决定) + INFINIOP_WORKSPACE_TENSOR(workspace, SiluAndMul, descriptor); + + auto planned = new PlannedMeta{ + descriptor, + graph::GraphTensor(workspace), + graph::GraphTensor(output), + graph::GraphTensor(input)}; + + return planned; +} + +// 实际执行阶段 +void run(void *planned_meta) { + auto planned = reinterpret_cast(planned_meta); + + // 调用我们在之前步骤中实现的 infiniop 接口 + INFINICORE_CHECK_ERROR(infiniopSiluAndMul( + planned->descriptor->desc, + planned->workspace->data(), + planned->workspace->numel(), + planned->output->data(), + planned->input->data(), + context::getStream())); +} + +// 清理逻辑 +void cleanup(void **planned_meta_ptr) { + delete *reinterpret_cast(planned_meta_ptr); + *planned_meta_ptr = nullptr; +} + +// 注册算子到所有支持的设备 +INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(SiluAndMul, &plan, &run, &cleanup); + +} // namespace infinicore::op::silu_and_mul_impl::infiniop diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index 3d6ebe79a..4340b5b8a 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -19,6 +19,7 @@ #include "ops/rope.hpp" #include "ops/silu.hpp" #include "ops/swiglu.hpp" +#include "ops/silu_and_mul.hpp" namespace py = pybind11; @@ -42,6 +43,7 @@ inline void bind(py::module &m) { bind_swiglu(m); bind_rope(m); bind_embedding(m); + bind_silu_and_mul(m); } } // namespace infinicore::ops diff 
--git a/src/infinicore/pybind11/ops/silu_and_mul.hpp b/src/infinicore/pybind11/ops/silu_and_mul.hpp new file mode 100644 index 000000000..379575868 --- /dev/null +++ b/src/infinicore/pybind11/ops/silu_and_mul.hpp @@ -0,0 +1,31 @@ +#pragma once + +#include + +#include "infinicore/ops/silu_and_mul.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_silu_and_mul(py::module &m) { + // 绑定非原地函数: Tensor silu_and_mul(Tensor input) + m.def("silu_and_mul", + &op::silu_and_mul, + py::arg("input"), + R"doc( + SiLU and Mul (SwiGLU) activation function. + Input should be [..., 2*d], output will be [..., d]. + )doc"); + + // 绑定原地/指定输出函数: void silu_and_mul_(Tensor output, Tensor input) + m.def("silu_and_mul_", + &op::silu_and_mul_, + py::arg("output"), + py::arg("input"), + R"doc( + In-place or destination-specified SiLU and Mul (SwiGLU) activation function. + )doc"); +} + +} // namespace infinicore::ops diff --git a/src/infiniop/ops/silu_and_mul/info.h b/src/infiniop/ops/silu_and_mul/info.h new file mode 100644 index 000000000..c7be1ddff --- /dev/null +++ b/src/infiniop/ops/silu_and_mul/info.h @@ -0,0 +1,54 @@ +#ifndef __SILU_AND_MUL_INFO_H__ +#define __SILU_AND_MUL_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include + +namespace op::silu_and_mul { + + class SiluAndMulInfo { + SiluAndMulInfo() = default; + + public: + infiniDtype_t dtype; + size_t batch_size; + size_t out_hidden_dim; + + static utils::Result create(infiniopTensorDescriptor_t y_desc, infiniopTensorDescriptor_t x_desc) { + auto dtype = y_desc->dtype(); + + auto x_shape = x_desc->shape(); + auto y_shape = y_desc->shape(); + auto ndim = x_desc->ndim(); + + if (ndim != y_desc->ndim()) { + return INFINI_STATUS_BAD_PARAM; + } + + if (x_shape[ndim - 1] != 2 * y_shape[ndim - 1]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t batch = 1; + for (int i = 0; i < (int)ndim - 1; ++i) { + if (x_shape[i] != y_shape[i]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + 
} + batch *= y_shape[i]; + } + + return utils::Result(SiluAndMulInfo{ + dtype, + batch, + y_shape[ndim - 1]}); + } + + private: + SiluAndMulInfo(infiniDtype_t dtype, size_t batch, size_t hidden) + : dtype(dtype), batch_size(batch), out_hidden_dim(hidden) {} + }; + +} + +#endif // __SILU_AND_MUL_INFO_H__ diff --git a/src/infiniop/ops/silu_and_mul/moore/silu_and_mul_moore.h b/src/infiniop/ops/silu_and_mul/moore/silu_and_mul_moore.h new file mode 100644 index 000000000..2789e2de6 --- /dev/null +++ b/src/infiniop/ops/silu_and_mul/moore/silu_and_mul_moore.h @@ -0,0 +1,8 @@ +#ifndef __SILU_AND_MUL_MOORE_API_H__ +#define __SILU_AND_MUL_MOORE_API_H__ + +#include "../silu_and_mul.h" + +DESCRIPTOR(moore) + +#endif // __SILU_AND_MUL_MOORE_API_H__ diff --git a/src/infiniop/ops/silu_and_mul/moore/silu_and_mul_moore.mu b/src/infiniop/ops/silu_and_mul/moore/silu_and_mul_moore.mu new file mode 100644 index 000000000..48fcb9609 --- /dev/null +++ b/src/infiniop/ops/silu_and_mul/moore/silu_and_mul_moore.mu @@ -0,0 +1,123 @@ +#include "../../../devices/moore/moore_common.h" +#include "../../../devices/moore/moore_handle.h" +#include "silu_and_mul_moore.h" + +#include +#include + +namespace op::silu_and_mul::moore { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + + if (!desc_ptr) { + return INFINI_STATUS_BAD_PARAM; + } + + auto handle = reinterpret_cast(handle_); + auto dtype = y_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + if (x_desc->dtype() != dtype) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + auto result = SiluAndMulInfo::create(y_desc, x_desc); + CHECK_RESULT(result); + auto info = result.take(); + + *desc_ptr = new Descriptor( + new Opaque{handle->internal()}, + std::move(info), + 0, + 
handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculate_impl( + const SiluAndMulInfo &info, + std::shared_ptr &internal, + void *y, + const void *x, + void *stream) { + + return internal->useMudnn( + (musaStream_t)stream, + [&](::musa::dnn::Handle &mudnn_handle) -> infiniStatus_t { + + ::musa::dnn::Tensor x_t, y_t; + + if constexpr (std::is_same_v) { + x_t.SetType(::musa::dnn::Tensor::Type::HALF); + y_t.SetType(::musa::dnn::Tensor::Type::HALF); + } else if constexpr (std::is_same_v) { + x_t.SetType(::musa::dnn::Tensor::Type::BFLOAT16); + y_t.SetType(::musa::dnn::Tensor::Type::BFLOAT16); + } else { + x_t.SetType(::musa::dnn::Tensor::Type::FLOAT); + y_t.SetType(::musa::dnn::Tensor::Type::FLOAT); + } + + x_t.SetAddr(const_cast(x)); + y_t.SetAddr(y); + + // --- Construct 2D dimension information --- + // Explicitly distinguish between Batch and Hidden dimensions + int64_t b = static_cast(info.batch_size); + int64_t h = static_cast(info.out_hidden_dim); + + // Input x logical shape is [batch, 2 * hidden] + std::array x_dims = {b, h * 2}; + std::array x_strides = {h * 2, 1}; + + // Output y logical shape is [batch, hidden] + std::array y_dims = {b, h}; + std::array y_strides = {h, 1}; + + x_t.SetNdInfo(2, x_dims.data(), x_strides.data()); + y_t.SetNdInfo(2, y_dims.data(), y_strides.data()); + + // Invoke muDNN SwiGLU + // muDNN will split each row (length 2*h) internally, + // muDNN treats the first h elements of input x as the 'gate' + // and the following h elements as the 'up' projection. 
+ ::musa::dnn::SwiGlu swiglu; + swiglu.Run(mudnn_handle, y_t, x_t); + + return INFINI_STATUS_SUCCESS; + }); +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + void *y, const void *x, + void *stream) const { + + infiniDtype_t dtype = _info.dtype; + + switch (dtype) { + case INFINI_DTYPE_F16: + return calculate_impl(_info, _opaque->internal, y, x, stream); + case INFINI_DTYPE_F32: + return calculate_impl(_info, _opaque->internal, y, x, stream); + case INFINI_DTYPE_BF16: + return calculate_impl<__mt_bfloat16>(_info, _opaque->internal, y, x, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +} // namespace op::silu_and_mul::moore diff --git a/src/infiniop/ops/silu_and_mul/operator.cc b/src/infiniop/ops/silu_and_mul/operator.cc new file mode 100644 index 000000000..85f651607 --- /dev/null +++ b/src/infiniop/ops/silu_and_mul/operator.cc @@ -0,0 +1,82 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/silu_and_mul.h" + +#ifdef ENABLE_MOORE_API +#include "moore/silu_and_mul_moore.h" +#endif + +__C infiniStatus_t infiniopCreateSiluAndMulDescriptor( + infiniopHandle_t handle, + infiniopSiluAndMulDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::silu_and_mul::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + x_desc); + + switch (handle->device) { +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + + +__C infiniStatus_t infiniopGetSiluAndMulWorkspaceSize(infiniopSiluAndMulDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + } + return 
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + + +__C infiniStatus_t infiniopSiluAndMul( + infiniopSiluAndMulDescriptor_t desc, + void *workspace, size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, y, x, stream); + + switch (desc->device_type) { +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + + +__C infiniStatus_t infiniopDestroySiluAndMulDescriptor(infiniopSiluAndMulDescriptor_t desc) { + +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_MOORE_API + DESTROY(INFINI_DEVICE_MOORE, moore); +#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} diff --git a/src/infiniop/ops/silu_and_mul/silu_and_mul.h b/src/infiniop/ops/silu_and_mul/silu_and_mul.h new file mode 100644 index 000000000..ced75e68d --- /dev/null +++ b/src/infiniop/ops/silu_and_mul/silu_and_mul.h @@ -0,0 +1,46 @@ +#ifndef SILU_AND_MUL_H +#define SILU_AND_MUL_H + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::silu_and_mul::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + SiluAndMulInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + SiluAndMulInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t y_desc, \ + 
# ============================================================================
# NOTE(review): this span was a whitespace-mangled paste of a git patch (all
# newlines collapsed).  The two units below are reconstructed with their line
# structure restored; Chinese comments are translated to English.  A fragment
# of the silu_and_mul.h DESCRIPTOR macro preceding this span belongs to the
# C++ header and is not reproduced here.
# ============================================================================

# --- test/infinicore/ops/silu_and_mul.py (new file) -------------------------
import sys
import os

sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

import torch
import infinicore
from framework import (
    BaseOperatorTest,
    TensorSpec,
    TestCase,
    GenericTestRunner,
    is_broadcast,
)

# ==============================================================================
# Operator-specific configuration
# ==============================================================================

# Test cases format: (input_shape, input_strides, output_strides)
# SiLUAndMul: Input (..., 2*d) -> Output (..., d)
_TEST_CASES_DATA = [
    # Basic 2D: [2, 4] -> [2, 2]
    ((2, 4), None, None),
    # 2D Large: [1024, 1024] -> [1024, 512]
    ((1024, 1024), None, None),
    # 3D: [2, 4, 8] -> [2, 4, 4]
    ((2, 4, 8), None, None),
    # LLM typical hidden size (e.g., Llama SwiGLU: intermediate_size=11008)
    # [1, 11008*2] -> [1, 11008]
    ((1, 22016), None, None),
    # Contiguous 3D case.
    # NOTE(review): the original comment said "Strided tensors", but no
    # strides are supplied here — confirm whether a strided case was intended.
    ((2, 4, 256), None, None),
]

# Tolerance configuration
_TOLERANCE_MAP = {
    infinicore.float16: {"atol": 1e-3, "rtol": 1e-3},
    infinicore.float32: {"atol": 1e-5, "rtol": 1e-5},
    infinicore.bfloat16: {"atol": 5e-3, "rtol": 1e-2},
}

# Data types to test
_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]


def parse_test_cases():
    """
    Parse SiLUAndMul test case data.

    Input: [..., 2*d], Output: [..., d]
    """
    test_cases = []

    for data in _TEST_CASES_DATA:
        input_shape = data[0]
        input_strides = data[1] if len(data) > 1 else None
        output_strides = data[2] if len(data) > 2 else None

        # Derive the output shape: the last dimension is halved.
        output_shape = list(input_shape)
        output_shape[-1] //= 2
        output_shape = tuple(output_shape)

        # SiLUAndMul cannot run in place on the input (shapes differ), but it
        # does support an explicit output tensor (out=output).
        output_supports_explicit_out = not is_broadcast(output_strides)

        for dtype in _TENSOR_DTYPES:
            tolerance = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4})

            input_spec = TensorSpec.from_tensor(input_shape, input_strides, dtype)
            output_spec = TensorSpec.from_tensor(output_shape, output_strides, dtype)

            # Test case 1: out-of-place (result returned).
            test_cases.append(
                TestCase(
                    inputs=[input_spec],
                    kwargs={},
                    output_spec=None,
                    comparison_target=None,
                    tolerance=tolerance,
                    description="SiLUAndMul - OUT_OF_PLACE",
                )
            )

            # Test case 2: explicit output tensor: silu_and_mul(input, out=output).
            # NOTE(review): kwargs=None here vs kwargs={} above is kept from
            # the original — confirm the framework treats them the same.
            if output_supports_explicit_out:
                test_cases.append(
                    TestCase(
                        inputs=[input_spec],
                        kwargs=None,
                        output_spec=output_spec,
                        comparison_target="out",
                        tolerance=tolerance,
                        description="SiLUAndMul - OUT_PARAM",
                    )
                )

    return test_cases


class OpTest(BaseOperatorTest):
    """SiLUAndMul operator test (SwiGLU)."""

    def __init__(self):
        super().__init__("SiLUAndMul")

    def get_test_cases(self):
        return parse_test_cases()

    def torch_operator(self, input, out=None, **kwargs):
        """PyTorch SwiGLU reference: SiLU(x_gate) * x_up."""
        d = input.shape[-1] // 2
        # Split the last dimension into the gate and up halves.
        gate, up = torch.split(input, [d, d], dim=-1)
        result = torch.nn.functional.silu(gate) * up

        if out is not None:
            out.copy_(result)
            return out
        return result

    def infinicore_operator(self, input, out=None, **kwargs):
        """InfiniCore SiLUAndMul implementation."""

        import infinicore.nn.functional as F

        return F.silu_and_mul(input, out=out)


def main():
    """Main entry point."""
    runner = GenericTestRunner(OpTest)
    runner.run_and_exit()


if __name__ == "__main__":
    main()


# --- test/infiniop/libinfiniop/op_register.py (appended registration) --------
# NOTE(review): item 4 physically sits just past this span in the mangled
# source; it is reconstructed here so the registration function is complete.
@OpRegister.operator
def silu_and_mul(lib):
    # 1. Register the descriptor-creation entry point.
    #    Args: handle, desc_ptr, output_desc, input_desc
    lib.infiniopCreateSiluAndMulDescriptor.restype = c_int32
    lib.infiniopCreateSiluAndMulDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]

    # 2. Register the workspace-size query.
    #    Args: desc, size_ptr
    lib.infiniopGetSiluAndMulWorkspaceSize.restype = c_int32
    lib.infiniopGetSiluAndMulWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]

    # 3. Register the compute entry point.
    #    Args: desc, workspace, workspace_size, output_ptr, input_ptr, stream
    lib.infiniopSiluAndMul.restype = c_int32
    lib.infiniopSiluAndMul.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_size_t,
        c_void_p,
        c_void_p,
        c_void_p,
    ]

    # 4. Register the descriptor-destruction entry point.
    #    Args: desc
    lib.infiniopDestroySiluAndMulDescriptor.restype = c_int32
    lib.infiniopDestroySiluAndMulDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
# ============================================================================
# NOTE(review): this span was a whitespace-mangled paste of a git patch (all
# newlines collapsed).  The unit below is test/infiniop/silu_and_mul.py
# reconstructed with its line structure restored.  The fragment at the start of
# this span (destroy-descriptor ctypes registration) belongs to
# op_register.py's silu_and_mul() and is not reproduced here.
# ============================================================================

# --- test/infiniop/silu_and_mul.py (new file) --------------------------------
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
    LIBINFINIOP,
    TestTensor,
    get_test_devices,
    check_error,
    test_operator,
    get_args,
    debug,
    get_tolerance,
    profile_operation,
    TestWorkspace,
    InfiniDtype,
    InfiniDtypeNames,
    InfiniDeviceNames,
    infiniopOperatorDescriptor_t,
)

# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# Format: (input_shape, output_shape)
# Referencing vLLM kernel Silu_and_Mul interface:
# input_shape is [..., 2*d], output_shape is [..., d]
_TEST_CASES = [
    # input_shape, output_shape
    ((2, 8), (2, 4)),
    ((1024, 1024), (1024, 512)),
    ((16, 8192), (16, 4096)),
    ((2, 128, 2048), (2, 128, 1024)),
    ((8, 1, 4096), (8, 1, 2048)),
    ((2, 4, 16, 256), (2, 4, 16, 128)),
]

_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]

_TOLERANCE_MAP = {
    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
    InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6},
    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
}

DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 100


# PyTorch reference: silu(gate) * up where [gate, up] = split(input)
def silu_and_mul_torch(out, input_tensor):
    """
    Computes the SwiGLU activation function: SiLU(gate) * up.
    """
    # Split the last dimension into two halves:
    # the first half is 'gate', the second is 'up'.
    d = input_tensor.shape[-1] // 2
    gate = input_tensor[..., :d]
    up = input_tensor[..., d:]

    # Apply SiLU to the gate and multiply by the up projection.
    torch.mul(torch.nn.functional.silu(gate), up, out=out)


# ==============================================================================
# Test Logic
# ==============================================================================
def test(
    handle,
    device,
    input_shape,
    output_shape,
    dtype=InfiniDtype.F16,
    sync=None,
):
    """Run one SiluAndMul case against the PyTorch reference."""
    print(
        f"Testing SiluAndMul on {InfiniDeviceNames[device]} with "
        f"input_shape:{input_shape} output_shape:{output_shape} dtype:{InfiniDtypeNames[dtype]}"
    )

    a = TestTensor(input_shape, None, dtype, device)
    c = TestTensor(output_shape, None, dtype, device, mode="zeros")
    ans = TestTensor(output_shape, None, dtype, device, mode="zeros")

    # Only contiguous tensors are supported by this operator.
    if not (
        a.torch_tensor().is_contiguous()
        and c.torch_tensor().is_contiguous()
        and ans.torch_tensor().is_contiguous()
    ):
        raise ValueError("This operator only supports contiguous memory layout.")

    # PyTorch answer reference.
    def torch_silu_and_mul_reference():
        silu_and_mul_torch(ans.torch_tensor(), a.torch_tensor())

    torch_silu_and_mul_reference()

    if sync is not None:
        sync()

    descriptor = infiniopOperatorDescriptor_t()
    check_error(
        LIBINFINIOP.infiniopCreateSiluAndMulDescriptor(
            handle,
            ctypes.byref(descriptor),
            c.descriptor,
            a.descriptor,
        )
    )

    # Invalidate the tensor descriptors once the operator descriptor holds
    # the shape information.
    # NOTE(review): `ans`'s descriptor is never destroyed here (as in the
    # original) — confirm whether TestTensor cleans it up itself.
    for tensor in [a, c]:
        tensor.destroy_desc()

    # Workspace
    workspace_size = c_uint64(0)
    check_error(
        LIBINFINIOP.infiniopGetSiluAndMulWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    workspace = TestWorkspace(workspace_size.value, device)

    def lib_op():
        check_error(
            LIBINFINIOP.infiniopSiluAndMul(
                descriptor,
                workspace.data(),
                workspace_size.value,
                c.data(),
                a.data(),
                None,
            )
        )

    lib_op()

    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)

    if DEBUG:
        debug(c.actual_tensor(), ans.torch_tensor(), atol=atol, rtol=rtol)

    assert torch.allclose(c.actual_tensor(), ans.torch_tensor(), atol=atol, rtol=rtol)

    # Profiling workflow
    if PROFILE:
        profile_operation("PyTorch", lambda: torch_silu_and_mul_reference(), device, NUM_PRERUN, NUM_ITERATIONS)
        profile_operation("    lib", lambda: lib_op(), device, NUM_PRERUN, NUM_ITERATIONS)

    check_error(LIBINFINIOP.infiniopDestroySiluAndMulDescriptor(descriptor))


# ==============================================================================
# Main Execution
# ==============================================================================
if __name__ == "__main__":
    args = get_args()

    DEBUG = args.debug
    PROFILE = args.profile
    NUM_PRERUN = args.num_prerun
    NUM_ITERATIONS = args.num_iterations

    for device in get_test_devices(args):
        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)

    print("\033[92mSiluAndMul Test passed!\033[0m")