amd · albiol2004 · Apr 9, 2026
@@ -59,7 +59,7 @@ The IRON Python API for Ryzen™ AI NPUs is described in the following paper:
 | [Reduction]() | Reduction | bfloat16 | | | 🟡 |  |
 | [Dequant](./aie_kernels/generic/expand.cc) | Dequant Q4NX from [AWQ](https://github.com/mit-han-lab/llm-awq) to bfloat16 | bfloat16 | ✓ | ✓ | 🟢 | [iron/operators/dequant/](./iron/operators/dequant/) |
 | [RELU](./aie_kernels/aie2/relu.cc) | RELU | bfloat16 | ✓ | ✓ | 🟢 | [iron/operators/relu/](./iron/operators/relu/) |
-| [Leaky RELU](./aie_kernels/aie2p/leaky_relu.cc) (WIP) | Leaky RELU kernel | bfloat16 | | ✓ | ⚪ | [iron/operators/leaky_relu/](./iron/operators/leaky_relu/) |
+| [Leaky RELU](./aie_kernels/aie2/leaky_relu.cc) | Leaky RELU | bfloat16 | ✓ | ✓ | 🟢 | [iron/operators/leaky_relu/](./iron/operators/leaky_relu/) |
 | [GELU](./aie_kernels/aie2/gelu.cc) | GELU | bfloat16 | ✓ | ✓ | 🟢 | [iron/operators/gelu/](./iron/operators/gelu/) |
 | [LayerNorm](./aie_kernels/aie2/layer_norm.cc) | LayerNorm | bfloat16 | ✓ | ✓ | 🟢 | [iron/operators/layer_norm/](./iron/operators/layer_norm/) |
 | [Convolution]() | Convolution | bfloat16 | | | 🟡 |  |

@@ -0,0 +1,47 @@
+// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "../aie_kernel_utils.h"
+
+#include <aie_api/aie.hpp>
+#include <stdint.h>
+
+using namespace aie;
+
+// Vectorized Leaky ReLU over a bfloat16 buffer.
+//
+// Reads `a` and writes `c` sixteen lanes at a time, storing
+// max(x, alpha * x) for every element. That expression equals Leaky ReLU
+// (x when x > 0, alpha * x otherwise) only while alpha <= 1.
+//
+//   a           - input buffer (restrict-qualified)
+//   c           - output buffer (restrict-qualified)
+//   vector_size - number of elements to process; the loop advances in steps
+//                 of 16 with no tail handling, so this is presumably expected
+//                 to be a multiple of 16 - confirm against the caller
+//   alpha       - slope applied to the input before taking the maximum
+//
+// NOTE(review): the loop carries a minimum-iteration-count hint of 16, which
+// suggests callers are expected to pass at least 256 elements - confirm.
+void leaky_relu_vectorized_bf16(bfloat16 *restrict a,
+                                bfloat16 *restrict c,
+                                const int32_t vector_size,
+                                const bfloat16 alpha)
+{
+    constexpr int kLanes = 16;
+    using vec_t = aie::vector<bfloat16, kLanes>;
+
+    event0();
+
+    // Slope replicated into every lane; built once, outside the loop.
+    const vec_t slope = aie::broadcast<bfloat16, kLanes>(alpha);
+
+    auto src = aie::begin_restrict_vector<kLanes>((bfloat16 *)a);
+    auto dst = aie::begin_restrict_vector<kLanes>((bfloat16 *)c);
+
+    AIE_PREPARE_FOR_PIPELINING
+    AIE_LOOP_MIN_ITERATION_COUNT(16)
+    for (int offset = 0; offset < vector_size; offset += kLanes) {
+        const vec_t x = *src++;
+        const vec_t scaled = aie::mul(x, slope);
+        // Keep the larger of x and alpha * x in each lane.
+        *dst++ = aie::max(x, scaled);
+    }
+
+    event1();
+}
+
+extern "C" {
+
+// C-linkage entry point for the Leaky ReLU kernel.
+// Forwards all four arguments unchanged to leaky_relu_vectorized_bf16;
+// input_size is passed as that function's element count.
+void leaky_relu_bf16(bfloat16 *restrict input,
+                     bfloat16 *restrict output,
+                     int input_size,
+                     bfloat16 alpha)
+{
+    leaky_relu_vectorized_bf16(input, output, input_size, alpha);
+}
+
+} // extern "C"
@@ -50,7 +50,7 @@ def my_leaky_relu(
     leaky_relu_fcn = Kernel(
         "leaky_relu_bf16",
         "leaky_relu.o",
-        [line_type, line_type, np.int32, np.dtype[xfr_dtype]],
+        [line_type, line_type, np.int32, xfr_dtype],
     )
 
     # Task for the core to perform

@@ -25,7 +25,6 @@ def get_params():
 @pytest.mark.parametrize(
     "input_length,num_aie_columns,num_channels,tile_size,alpha", get_params()
 )
-@pytest.mark.skip(reason="Leaky ReLU is currently broken (#36)")
 @pytest.mark.metrics(
     Latency=r"Latency \(us\): (?P<value>[\d\.]+)",
     Bandwidth=r"Effective Bandwidth: (?P<value>[\d\.e\+-]+) GB/s",