InfiniTensor · CearX · Jan 11, 2026 · Jan 13, 2026 · Jan 14, 2026
diff --git a/.gitignore b/.gitignore
@@ -28,4 +28,8 @@ __pycache__/
 # txt
 *.txt
 
+# check
+check_cpp/
+check/
+
 *.http
diff --git a/include/infinicore_infer.h b/include/infinicore_infer.h
@@ -6,5 +6,6 @@
 
 #include "infinicore_infer/models/deepseek.h"
 #include "infinicore_infer/models/jiuge.h"
+#include "infinicore_infer/models/qwen3_vl.h"
 
 #endif /* INFINICORE_INFER_H */
diff --git a/include/infinicore_infer/models/qwen3_vl.h b/include/infinicore_infer/models/qwen3_vl.h
@@ -0,0 +1,108 @@
+#ifndef MODEL_QWEN3_VL_H
+#define MODEL_QWEN3_VL_H
+
+#include <infiniccl.h>
+#include <infiniop.h>
+#include <infinirt.h>
+
+#include <stdint.h>
+
+#include "../weights_loader.h"
+
+struct Qwen3VLModel;
+
+typedef struct
+{
+    infiniDtype_t dt_logits;
+    infiniDtype_t dt_linear_w;
+    infiniDtype_t dt_norm_w;
+    size_t nlayer, d, nh, nkvh, dh, di, dctx, dvoc;
+    float epsilon, theta;
+    uint32_t end_token;
+    char has_qkv_bias;
+    char use_qk_norm;
+    // Vision encoder parameters
+    size_t vision_hidden_size;
+    size_t vision_layers;
+    size_t vision_heads;
+    size_t patch_size;
+    size_t img_size;
+    // Token ids
+    uint32_t image_token_id;
+    uint32_t video_token_id;
+} Qwen3VLMeta;
+
+//////////////////// APIs ///////////////////////
+__C __export struct ModelWeights *
+createQwen3VLWeights(const Qwen3VLMeta *,
+                     infiniDevice_t device,
+                     int ndev,
+                     const int *dev_ids);
+
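+/// @brief Create the model. NOTE(review): the parameters documented below (device, ndev, dev_ids) belong to createQwen3VLWeights; this function takes only the meta and the weights.
+/// @param device Coprocessor type
+/// @param ndev Number of coprocessors
+/// @param dev_ids Coprocessor IDs; array of length ndev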
+__C __export struct Qwen3VLModel *
+createQwen3VLModel(const Qwen3VLMeta *,
+                   const ModelWeights *);
+
+/// @brief Destroy the model
+__C __export void
+destroyQwen3VLModel(struct Qwen3VLModel *);
+
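+/// @brief Run one round of batched inference and sample a new token for each request. NOTE(review): the pixel_values parameter has no @param entry below.
+/// @param tokens Pointer to the input tokens
+/// @param ntok Number of input tokens
+/// @param nreq Number of requests
+/// @param req_lens Number of tokens in each request
+/// @param req_pos Starting position of each request
+/// @param pos_ids ViT position ids, layout [patches, 2] (h, w)
+/// @param pos_ids_len Length of the pos_ids array; should be patches * 2
+/// @param llm_pos_ids LLM 3D mRoPE position ids, layout [patches + text_len, 3] (t, h, w)
+/// @param llm_pos_ids_len Length of the llm_pos_ids array; should be (patches + text_len) * 3
+/// @param rope_section 3D mRoPE section configuration, layout [3] (t_max, h_max, w_max)
+/// @param rope_section_len Length of the rope_section array; should be 3
+/// @param kv_caches KV cache of each request
+/// @param temperature Sampling temperature (0. means greedy sampling)
+/// @param topk Sampling top-k (1 means greedy sampling)
+/// @param topp Sampling top-p
+/// @param output Output token array, one output per request; length must be at least nreq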
+__C __export void
+inferBatchQwen3VL(struct Qwen3VLModel *,
+                  const uint32_t *tokens, uint32_t ntok,
+                  const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
+                  const uint32_t *pos_ids, uint32_t pos_ids_len,
+                  const uint32_t *llm_pos_ids, uint32_t llm_pos_ids_len,
+                  const uint32_t *rope_section, uint32_t rope_section_len,
+                  const float *pixel_values,
+                  struct KVCache **kv_caches,
+                  const float *temperature, const uint32_t *topk, const float *topp,
+                  uint32_t *output);
+
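+/// @brief Run one round of batched inference and output the logits produced after the output embedding. NOTE(review): the pixel_values parameter has no @param entry below.
+/// @param tokens Pointer to the input tokens
+/// @param ntok Number of input tokens
+/// @param nreq Number of requests
+/// @param req_lens Number of tokens in each request
+/// @param req_pos Starting position of each request
+/// @param pos_ids ViT position ids, layout [patches, 2] (h, w)
+/// @param pos_ids_len Length of the pos_ids array; should be patches * 2
+/// @param llm_pos_ids LLM 3D mRoPE position ids, layout [patches + text_len, 3] (t, h, w)
+/// @param llm_pos_ids_len Length of the llm_pos_ids array; should be (patches + text_len) * 3
+/// @param rope_section 3D mRoPE section configuration, layout [3] (t_max, h_max, w_max)
+/// @param rope_section_len Length of the rope_section array; should be 3
+/// @param kv_caches KV cache of each request
+/// @param logits Output token array, one output per request; length must be at least nreq. NOTE(review): this text matches the `output` description of inferBatchQwen3VL, but logits is a void * — confirm the actual buffer size and dtype.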
+__C __export void
+forwardBatchQwen3VL(struct Qwen3VLModel *,
+                    const uint32_t *tokens, uint32_t ntok,
+                    const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
+                    const uint32_t *pos_ids, uint32_t pos_ids_len,
+                    const uint32_t *llm_pos_ids, uint32_t llm_pos_ids_len,
+                    const uint32_t *rope_section, uint32_t rope_section_len,
+                    const float *pixel_values,
+                    struct KVCache **kv_caches,
+                    void *logits);
+
+#endif
diff --git a/qw3vl.py b/qw3vl.py
@@ -0,0 +1,53 @@
+from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
+
+# default: Load the model on the available device(s)
+model = Qwen3VLForConditionalGeneration.from_pretrained(
+    "/home/cearx/qy/model/Qwen3-VL-2B-Instruct", dtype="auto", device_map="auto"
+)
+
+# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
+# model = Qwen3VLForConditionalGeneration.from_pretrained(
+#     "Qwen/Qwen3-VL-2B-Instruct",
+#     dtype=torch.bfloat16,
+#     attn_implementation="flash_attention_2",
+#     device_map="auto",
+# )
+
+processor = AutoProcessor.from_pretrained(
+    "/home/cearx/qy/model/Qwen3-VL-2B-Instruct")
+
+# url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
+url = "/home/cearx/CLIP/image3.jpg"
+
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "image",
+                "image": url,
+            },
+            {"type": "text", "text": "Describe this image."},
+        ],
+    }
+]
+
+# Preparation for inference
+inputs = processor.apply_chat_template(
+    messages,
+    tokenize=True,
+    add_generation_prompt=True,
+    return_dict=True,
+    return_tensors="pt"
+)
+inputs = inputs.to(model.device)
+
+# Inference: Generation of the output
+generated_ids = model.generate(**inputs, max_new_tokens=128)
+generated_ids_trimmed = [
+    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+]
+output_text = processor.batch_decode(
+    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+)
+print(output_text)
diff --git a/scripts/libinfinicore_infer/__init__.py b/scripts/libinfinicore_infer/__init__.py
@@ -1,13 +1,14 @@
-from .base import DataType, DeviceType, KVCacheCStruct
+from .base import DataType, DeviceType, KVCacheCStruct, ModelWeightsCStruct
 from .jiuge import JiugeModel, JiugeMetaCStruct, JiugeWeightsCStruct
-from .jiuge_awq import JiugeAWQModel, JiugeAWQMetaCStruct, ModelWeightsCStruct
+from .jiuge_awq import JiugeAWQModel, JiugeAWQMetaCStruct
 from .deepseek_v3 import (
     DeepSeekV3Model,
     DeepSeekV3MetaCStruct,
     DeepSeekV3WeightsCStruct,
     DeepSeekV3WeightLoaderCStruct,
     DeepSeekV3CacheCStruct,
 )
+from .qwen3_vl import Qwen3VLModel, Qwen3VLMetaCStruct
 
 __all__ = [
     "DataType",
@@ -23,5 +24,7 @@
     "DeepSeekV3MetaCStruct",
     "DeepSeekV3WeightsCStruct",
     "DeepSeekV3WeightLoaderCStruct",
+    "Qwen3VLModel",
+    "Qwen3VLMetaCStruct",
     "ModelRegister",
 ]
diff --git a/scripts/libinfinicore_infer/base.py b/scripts/libinfinicore_infer/base.py
@@ -43,6 +43,10 @@ class KVCacheCStruct(ctypes.Structure):
     pass
 
 
+class ModelWeightsCStruct(ctypes.Structure):
+    pass
+
+
 # Model registration system
 _model_registry = []
-Original file line number
+Diff line change
@@ Expand Up / @@ -28,4 +28,8 @@ __pycache__/ @@
     # txt
     *.txt
+    # check
+    check_cpp/
+    check/
     *.http