Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,8 @@ __pycache__/
# txt
*.txt

# check
check_cpp/
check/

*.http
1 change: 1 addition & 0 deletions include/infinicore_infer.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,6 @@

#include "infinicore_infer/models/deepseek.h"
#include "infinicore_infer/models/jiuge.h"
#include "infinicore_infer/models/qwen3_vl.h"

#endif /* INFINICORE_INFER_H */
108 changes: 108 additions & 0 deletions include/infinicore_infer/models/qwen3_vl.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
#ifndef MODEL_QWEN3_VL_H
#define MODEL_QWEN3_VL_H

#include <infiniccl.h>
#include <infiniop.h>
#include <infinirt.h>

#include <stdint.h>

#include "../weights_loader.h"

struct Qwen3VLModel;

// Model configuration passed by pointer to createQwen3VLWeights() and
// createQwen3VLModel().
// NOTE(review): field order and types form the C ABI of this struct and are
// presumably mirrored by a Python ctypes structure -- confirm before changing.
typedef struct
{
// Data types; names suggest logits, linear-layer weights and norm weights
// respectively -- TODO confirm against the implementation.
infiniDtype_t dt_logits;
infiniDtype_t dt_linear_w;
infiniDtype_t dt_norm_w;
// Language-model sizes. Names suggest: layer count, hidden size, attention
// heads, KV heads, head dim, intermediate size, max context length and
// vocabulary size -- TODO confirm.
size_t nlayer, d, nh, nkvh, dh, di, dctx, dvoc;
// Presumably the normalization epsilon and the RoPE base (theta) -- TODO confirm.
float epsilon, theta;
// Presumably the token id that ends generation -- TODO confirm.
uint32_t end_token;
// Flags stored as char; presumably non-zero means enabled -- TODO confirm.
char has_qkv_bias;
char use_qk_norm;
// Vision encoder parameters
size_t vision_hidden_size;
size_t vision_layers;
size_t vision_heads;
size_t patch_size;
size_t img_size;
// Token ids (names suggest the image / video placeholder tokens -- TODO confirm)
uint32_t image_token_id;
uint32_t video_token_id;
} Qwen3VLMeta;

//////////////////// APIs ///////////////////////
/// @brief Create the weights object for a Qwen3-VL model.
/// The first (unnamed) parameter is the model metadata.
/// @param device  kind of accelerator device
/// @param ndev    number of accelerator devices
/// @param dev_ids accelerator device ids, length ndev
/// @return pointer to a struct ModelWeights.
/// NOTE(review): ownership and how the returned object is released are not
/// visible from this header -- confirm.
__C __export struct ModelWeights *
createQwen3VLWeights(const Qwen3VLMeta *,
infiniDevice_t device,
int ndev,
const int *dev_ids);

/// @brief Create a Qwen3-VL model.
/// @param meta    model metadata
/// @param weights model weights; presumably the object returned by
///                createQwen3VLWeights() -- TODO confirm
/// @return model handle; destroyQwen3VLModel() takes this handle to destroy
///         the model
__C __export struct Qwen3VLModel *
createQwen3VLModel(const Qwen3VLMeta *meta,
                   const struct ModelWeights *weights);

/// @brief Destroy the model.
/// The (unnamed) parameter is the model handle to destroy.
__C __export void
destroyQwen3VLModel(struct Qwen3VLModel *);

/// @brief Run one round of batched inference and sample a new token.
/// The first (unnamed) parameter is the model handle.
/// @param tokens address of the input tokens
/// @param ntok number of input tokens
/// @param req_lens number of tokens of each request
/// @param nreq number of requests
/// @param req_pos start position of each request
/// @param pos_ids ViT position ids, layout [patches, 2] (h,w)
/// @param pos_ids_len length of the pos_ids array, should be patches*2
/// @param llm_pos_ids LLM 3D mRoPE position ids, layout [patches+text_len, 3] (t,h,w)
/// @param llm_pos_ids_len length of the llm_pos_ids array, should be (patches+text_len)*3
/// @param rope_section 3D mRoPE section configuration, layout [3] (t_max,h_max,w_max)
/// @param rope_section_len length of the rope_section array, should be 3
/// @param pixel_values NOTE(review): undocumented in the original; presumably the
///        image pixel data for the vision encoder -- layout and length TODO confirm
/// @param kv_caches KV cache of each request
/// @param temperature sampling temperature (0. means greedy sampling)
/// @param topk sampling top-k (1 means greedy sampling)
/// @param topp sampling top-p
/// @param output output token array, one output per request, length at least nreq
__C __export void
inferBatchQwen3VL(struct Qwen3VLModel *,
const uint32_t *tokens, uint32_t ntok,
const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
const uint32_t *pos_ids, uint32_t pos_ids_len,
const uint32_t *llm_pos_ids, uint32_t llm_pos_ids_len,
const uint32_t *rope_section, uint32_t rope_section_len,
const float *pixel_values,
struct KVCache **kv_caches,
const float *temperature, const uint32_t *topk, const float *topp,
uint32_t *output);

/// @brief Run one round of batched inference and output the logits after the
///        output embedding.
/// The first (unnamed) parameter is the model handle.
/// @param tokens address of the input tokens
/// @param ntok number of input tokens
/// @param req_lens number of tokens of each request
/// @param nreq number of requests
/// @param req_pos start position of each request
/// @param pos_ids ViT position ids, layout [patches, 2] (h,w)
/// @param pos_ids_len length of the pos_ids array, should be patches*2
/// @param llm_pos_ids LLM 3D mRoPE position ids, layout [patches+text_len, 3] (t,h,w)
/// @param llm_pos_ids_len length of the llm_pos_ids array, should be (patches+text_len)*3
/// @param rope_section 3D mRoPE section configuration, layout [3] (t_max,h_max,w_max)
/// @param rope_section_len length of the rope_section array, should be 3
/// @param pixel_values NOTE(review): undocumented in the original; presumably the
///        image pixel data for the vision encoder -- layout and length TODO confirm
/// @param kv_caches KV cache of each request
/// @param logits output buffer for the logits.
///        NOTE(review): the original comment described this as an output token
///        array with one entry per request (length at least nreq), which looks
///        copied from inferBatchQwen3VL; the element type and required size of
///        this void* buffer are not visible from this header -- confirm.
__C __export void
forwardBatchQwen3VL(struct Qwen3VLModel *,
const uint32_t *tokens, uint32_t ntok,
const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
const uint32_t *pos_ids, uint32_t pos_ids_len,
const uint32_t *llm_pos_ids, uint32_t llm_pos_ids_len,
const uint32_t *rope_section, uint32_t rope_section_len,
const float *pixel_values,
struct KVCache **kv_caches,
void *logits);

#endif
53 changes: 53 additions & 0 deletions qw3vl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor

# Demo: ask Qwen3-VL to describe one image via Hugging Face transformers.
# The defaults below reproduce the values this script originally hard-coded, so
# running it without arguments behaves as before.
DEFAULT_MODEL_PATH = "/home/cearx/qy/model/Qwen3-VL-2B-Instruct"
# Example of a remote image that was used previously:
# "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
DEFAULT_IMAGE = "/home/cearx/CLIP/image3.jpg"
DEFAULT_PROMPT = "Describe this image."
DEFAULT_MAX_NEW_TOKENS = 128


def build_messages(image, prompt=DEFAULT_PROMPT):
    """Return a single-turn chat: one user message holding an image and a text prompt.

    Args:
        image: value stored under the "image" key of the image content item.
        prompt: text stored under the "text" key of the text content item.
    """
    return [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": prompt},
            ],
        }
    ]


def parse_args(argv=None):
    """Parse command-line options; every option defaults to the original hard-coded value.

    Args:
        argv: list of argument strings, or None to read them from sys.argv.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import line.
    import argparse

    parser = argparse.ArgumentParser(
        description="Describe an image with Qwen3-VL (transformers reference run)."
    )
    parser.add_argument("--model", default=DEFAULT_MODEL_PATH,
                        help="checkpoint passed to from_pretrained()")
    parser.add_argument("--image", default=DEFAULT_IMAGE,
                        help="image reference placed in the chat message")
    parser.add_argument("--prompt", default=DEFAULT_PROMPT,
                        help="text prompt placed after the image")
    parser.add_argument("--max-new-tokens", type=int, default=DEFAULT_MAX_NEW_TOKENS,
                        help="max_new_tokens passed to generate()")
    return parser.parse_args(argv)


def main(argv=None):
    """Load the model and processor, generate for one image prompt, and print the decoded text.

    Returns:
        The value returned by processor.batch_decode() (also printed).
    """
    args = parse_args(argv)

    # default: Load the model on the available device(s)
    model = Qwen3VLForConditionalGeneration.from_pretrained(
        args.model, dtype="auto", device_map="auto"
    )

    # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
    # model = Qwen3VLForConditionalGeneration.from_pretrained(
    #     "Qwen/Qwen3-VL-2B-Instruct",
    #     dtype=torch.bfloat16,
    #     attn_implementation="flash_attention_2",
    #     device_map="auto",
    # )

    processor = AutoProcessor.from_pretrained(args.model)

    # Preparation for inference
    inputs = processor.apply_chat_template(
        build_messages(args.image, args.prompt),
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt"
    )
    inputs = inputs.to(model.device)

    # Inference: Generation of the output
    generated_ids = model.generate(**inputs, max_new_tokens=args.max_new_tokens)
    # Drop the prompt tokens from the front of each sequence so only the newly
    # generated part is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    print(output_text)
    return output_text


# Guarded so that importing this file no longer loads the model as a side effect.
if __name__ == "__main__":
    main()
7 changes: 5 additions & 2 deletions scripts/libinfinicore_infer/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
from .base import DataType, DeviceType, KVCacheCStruct
from .base import DataType, DeviceType, KVCacheCStruct, ModelWeightsCStruct
from .jiuge import JiugeModel, JiugeMetaCStruct, JiugeWeightsCStruct
from .jiuge_awq import JiugeAWQModel, JiugeAWQMetaCStruct, ModelWeightsCStruct
from .jiuge_awq import JiugeAWQModel, JiugeAWQMetaCStruct
from .deepseek_v3 import (
DeepSeekV3Model,
DeepSeekV3MetaCStruct,
DeepSeekV3WeightsCStruct,
DeepSeekV3WeightLoaderCStruct,
DeepSeekV3CacheCStruct,
)
from .qwen3_vl import Qwen3VLModel, Qwen3VLMetaCStruct

__all__ = [
"DataType",
Expand All @@ -23,5 +24,7 @@
"DeepSeekV3MetaCStruct",
"DeepSeekV3WeightsCStruct",
"DeepSeekV3WeightLoaderCStruct",
"Qwen3VLModel",
"Qwen3VLMetaCStruct",
"ModelRegister",
]
4 changes: 4 additions & 0 deletions scripts/libinfinicore_infer/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ class KVCacheCStruct(ctypes.Structure):
pass


class ModelWeightsCStruct(ctypes.Structure):
    """ctypes structure declared without any fields.

    NOTE(review): presumably the Python-side stand-in for the C
    `struct ModelWeights`, intended to be used only through pointers --
    confirm against the bindings that reference it.
    """

    pass


# Model registration system
_model_registry = []

Expand Down
Loading