Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions include/infinicore/ops/bi_attention.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#pragma once

#include "../device.hpp"
#include "common/op.hpp"

namespace infinicore::op {
// Bi-directional (non-causal) attention operator entry point.
// The actual math lives in per-device backends registered in dispatcher().
class BiAttention {
public:
// Dispatch signature: (out, q, k, v, k_cache, v_cache, pos).
using schema = void (*)(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, size_t);
// Runs the backend registered for out's device type, writing into `out`.
static void execute(Tensor out, Tensor q, Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, size_t pos);
// Per-device-type registry of backend implementations (see *_infiniop.cc).
static common::OpDispatcher<schema> &dispatcher();
};

// Out-of-place variant: allocates and returns the output tensor.
Tensor bi_attention(Tensor q, Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, size_t pos);
// In-place variant: writes the result into a caller-provided `out`.
void bi_attention_(Tensor out, Tensor q, Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, size_t pos);
} // namespace infinicore::op
1 change: 1 addition & 0 deletions include/infiniop.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include "infiniop/ops/topkrouter.h"
#include "infiniop/ops/topksoftmax.h"
#include "infiniop/ops/zeros.h"
#include "infiniop/ops/bi_attention.h"
#include "infiniop/tensor_descriptor.h"

#endif // __INFINIOP_API_H__
34 changes: 34 additions & 0 deletions include/infiniop/ops/bi_attention.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#ifndef __INFINIOP_BI_ATTENTION_API_H__
#define __INFINIOP_BI_ATTENTION_API_H__

#include "../operator_descriptor.h"
// NOTE(review): gemm.h / swiglu.h appear inherited from the attention.h this
// header was copied from — confirm this API actually needs them.
#include "gemm.h"
#include "swiglu.h"

// Opaque handle for a configured bi-attention operator instance.
typedef struct InfiniopDescriptor *infiniopBiAttentionDescriptor_t;

// Creates a descriptor from the tensor descriptors of all operands plus the
// position offset `pos`. Returns a status code; on success *desc_ptr is set.
__C __export infiniStatus_t infiniopCreateBiAttentionDescriptor(infiniopHandle_t handle,
infiniopBiAttentionDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t out_desc,
infiniopTensorDescriptor_t q_desc,
infiniopTensorDescriptor_t k_desc,
infiniopTensorDescriptor_t v_desc,
infiniopTensorDescriptor_t k_cache_desc,
infiniopTensorDescriptor_t v_cache_desc,
size_t pos);

// Queries the scratch-buffer size (in bytes) required by infiniopBiAttention.
__C __export infiniStatus_t infiniopGetBiAttentionWorkspaceSize(infiniopBiAttentionDescriptor_t desc, size_t *size);

// Executes bi-attention. q/k/v are read-only; k_cache/v_cache are non-const —
// presumably updated in place with the new keys/values (confirm in the impl).
// `stream` is the backend execution stream (may be NULL for the default).
__C __export infiniStatus_t infiniopBiAttention(infiniopBiAttentionDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *out,
const void *q,
const void *k,
const void *v,
void *k_cache,
void *v_cache,
void *stream);

// Releases all resources owned by the descriptor.
__C __export infiniStatus_t infiniopDestroyBiAttentionDescriptor(infiniopBiAttentionDescriptor_t desc);
#endif // __INFINIOP_BI_ATTENTION_API_H__
28 changes: 28 additions & 0 deletions python/infinicore/ops/bi_attention.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor


def bi_attention(q, k, v, k_cache, v_cache, pos, *, out=None):
    """Run the native bi-directional attention op.

    Args:
        q, k, v: Input tensors (``infinicore.Tensor``).
        k_cache, v_cache: Key/value cache tensors; passed to the native op,
            which presumably updates them in place (they are non-const in the
            C API).
        pos: Position offset forwarded to the kernel.
        out: Optional pre-allocated output tensor. When omitted, the native
            op allocates the output and a new ``Tensor`` wrapper is returned.

    Returns:
        ``out`` when provided, otherwise a freshly allocated ``Tensor``.
    """
    if out is None:
        # BUG FIX: this branch previously called ``_infinicore.attention``,
        # which dispatched the plain (causal) attention op instead of the
        # bi-directional one exposed by this module.
        return Tensor(
            _infinicore.bi_attention(
                q._underlying,
                k._underlying,
                v._underlying,
                k_cache._underlying,
                v_cache._underlying,
                pos,
            )
        )

    _infinicore.bi_attention_(
        out._underlying,
        q._underlying,
        k._underlying,
        v._underlying,
        k_cache._underlying,
        v_cache._underlying,
        pos,
    )

    return out
31 changes: 31 additions & 0 deletions src/infinicore/ops/bi_attention/bi_attention.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#include "infinicore/ops/bi_attention.hpp"
#include "../../utils.hpp"

namespace infinicore::op {

// Returns the process-wide registry mapping device types to BiAttention
// backend implementations. Function-local static: constructed on first use,
// so backends can self-register from static initializers in any order.
common::OpDispatcher<BiAttention::schema> &BiAttention::dispatcher() {
    static common::OpDispatcher<BiAttention::schema> dispatcher_;
    return dispatcher_;
} // (stray trailing semicolon after the function definition removed)

// Validates device placement, then forwards all operands to the backend
// registered for the output tensor's device type.
void BiAttention::execute(Tensor out, Tensor q, Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, size_t pos) {
    // Every operand must live on the same device before dispatching.
    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, q, k, v, k_cache, v_cache);
    const auto device = out->device();
    infinicore::context::setDevice(device);
    auto backend = dispatcher().lookup(device.getType());
    backend(out, q, k, v, k_cache, v_cache, pos);
}

// Out-of-place bi-attention: allocates the output and delegates to
// bi_attention_. The output permutes q's leading axes:
// q is read as {n_q_head, seq_len, head_dim} and out is allocated as
// {seq_len, n_q_head, head_dim}.
// NOTE(review): the axis meanings are inferred from the variable names only —
// confirm this layout against the kernel implementation.
Tensor bi_attention(Tensor q, Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, size_t pos) {
size_t n_q_head = q->shape()[0];
size_t seq_len = q->shape()[1];
size_t head_dim = q->shape()[2];
Shape shape = {seq_len, n_q_head, head_dim};
// Output matches q's dtype and device.
auto out = Tensor::empty(shape, q->dtype(), q->device());
bi_attention_(out, q, k, v, k_cache, v_cache, pos);
return out;
}

// In-place bi-attention: thin wrapper over BiAttention::execute, writing the
// result into the caller-provided `out`.
void bi_attention_(Tensor out, Tensor q, Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, size_t pos) {
BiAttention::execute(out, q, k, v, k_cache, v_cache, pos);
}

} // namespace infinicore::op
52 changes: 52 additions & 0 deletions src/infinicore/ops/bi_attention/bi_attention_infiniop.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/bi_attention.hpp"
#include "infinicore/ops/common/cache.hpp"
#include <infiniop.h>

namespace infinicore::op::bi_attention_impl::infiniop {

// Per-thread LRU-style cache of infiniop descriptors, keyed by a hash of the
// call's tensor metadata (see calculate()). thread_local avoids locking; the
// eviction callback destroys the native descriptor.
thread_local common::OpCache<size_t, infiniopBiAttentionDescriptor_t> caches(
100, // capacity
[](infiniopBiAttentionDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyBiAttentionDescriptor(desc));
desc = nullptr;
}
});

// infiniop backend for BiAttention: creates (or re-uses) a native descriptor
// for this exact operand configuration, then launches the op on the current
// context stream.
void calculate(Tensor out, Tensor q, Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, size_t pos) {
// Cache key covers every operand plus pos, so any change in tensor metadata
// (or position) yields a fresh descriptor.
size_t seed = hash_combine(out, q, k, v, k_cache, v_cache, pos);

auto device = context::getDevice();
auto &cache = caches.getCache(device);

auto desc_opt = cache.get(seed);
infiniopBiAttentionDescriptor_t desc = nullptr;

if (!desc_opt) {
// Miss: build a descriptor from the operand descriptors and memoize it.
INFINICORE_CHECK_ERROR(infiniopCreateBiAttentionDescriptor(
context::getInfiniopHandle(device), &desc,
out->desc(), q->desc(), k->desc(), v->desc(),
k_cache->desc(), v_cache->desc(), pos));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}

// Workspace is (re)allocated on every call; the allocator presumably pools
// memory — TODO confirm, otherwise this is a per-call allocation cost.
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetBiAttentionWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);

INFINICORE_CHECK_ERROR(infiniopBiAttention(
desc, workspace->data(), workspace_size,
out->data(), q->data(), k->data(), v->data(),
k_cache->data(), v_cache->data(), context::getStream()));
}

// Self-registration at load time: installs this infiniop backend for all
// device types. NOTE(review): the `false` flag presumably means "do not
// overwrite existing registrations" — confirm against OpDispatcher::registerAll.
static bool registered = []() {
BiAttention::dispatcher().registerAll(&calculate, false);
return true;
}();

} // namespace infinicore::op::bi_attention_impl::infiniop
Empty file.
6 changes: 3 additions & 3 deletions src/infiniop/ops/attention/attention.h
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
#ifndef ATTENTION_H
#define ATTENTION_H
#ifndef BI_ATTENTION_H
#define BI_ATTENTION_H

#include "../../operator.h"
#include "info.h"

#define DESCRIPTOR(NAMESPACE) \
\
namespace op::attention::NAMESPACE { \
namespace op::bi_attention::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
Expand Down
37 changes: 37 additions & 0 deletions src/infiniop/ops/bi_attention/bi_attention.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// Internal descriptor declaration for the bi_attention op.
// BUG FIX: this new file kept the `ATTENTION_H` guard and `op::attention`
// namespace copied from attention.h; both collide with the original attention
// op. Guard renamed to a unique symbol and namespace fixed to op::bi_attention.
#ifndef __INFINIOP_BI_ATTENTION_H__
#define __INFINIOP_BI_ATTENTION_H__

#include "../../operator.h"
#include "info.h"

// Declares op::bi_attention::<NAMESPACE>::Descriptor for a backend namespace.
// NOTE(review): create() still takes the (y_desc, x_desc) parameter list of
// the op this file was copied from, while the public C API
// (infiniopCreateBiAttentionDescriptor) passes out/q/k/v/k_cache/v_cache
// descriptors plus pos — confirm and align the signatures.
#define DESCRIPTOR(NAMESPACE)                                 \
                                                              \
    namespace op::bi_attention::NAMESPACE {                   \
    class Descriptor final : public InfiniopDescriptor {      \
        struct Opaque;                                        \
        Opaque *_opaque;                                      \
        size_t _workspace_size;                               \
                                                              \
        Descriptor(                                           \
            Opaque *opaque,                                   \
            size_t workspace_size,                            \
            infiniDevice_t device_type,                       \
            int device_id)                                    \
            : InfiniopDescriptor{device_type, device_id},     \
              _opaque(opaque),                                \
              _workspace_size(workspace_size) {}              \
                                                              \
    public:                                                   \
        ~Descriptor();                                        \
                                                              \
        size_t workspaceSize() const { return _workspace_size; } \
                                                              \
        static infiniStatus_t create(                         \
            infiniopHandle_t handle,                          \
            Descriptor **desc_ptr,                            \
            infiniopTensorDescriptor_t y_desc,                \
            infiniopTensorDescriptor_t x_desc);               \
    };                                                        \
    }

#endif // __INFINIOP_BI_ATTENTION_H__
Loading