From fe65549d3f481ce32745ea628791e7aebf9a7dbc Mon Sep 17 00:00:00 2001 From: cearX Date: Sun, 11 Jan 2026 22:16:42 +0800 Subject: [PATCH 1/3] init --- include/infinicore_infer.h | 1 + include/infinicore_infer/models/qwen3_vl.h | 108 ++ scripts/libinfinicore_infer/__init__.py | 7 +- scripts/libinfinicore_infer/base.py | 4 + scripts/libinfinicore_infer/qwen3_vl.py | 203 ++++ scripts/qwen3vl.py | 900 ++++++++++++++++ src/cache_manager/opcache_manager.hpp | 15 + src/models/inference_context.cpp | 156 +++ src/models/inference_context.hpp | 62 ++ src/models/qwen3_vl/qwen3_vl.cpp | 1093 ++++++++++++++++++++ src/models/qwen3_vl/qwen3_vl.hpp | 110 ++ src/models/qwen3_vl/qwen3_vl_weight.cpp | 370 +++++++ 12 files changed, 3027 insertions(+), 2 deletions(-) create mode 100644 include/infinicore_infer/models/qwen3_vl.h create mode 100644 scripts/libinfinicore_infer/qwen3_vl.py create mode 100644 scripts/qwen3vl.py create mode 100644 src/models/qwen3_vl/qwen3_vl.cpp create mode 100644 src/models/qwen3_vl/qwen3_vl.hpp create mode 100644 src/models/qwen3_vl/qwen3_vl_weight.cpp diff --git a/include/infinicore_infer.h b/include/infinicore_infer.h index 0bed7bc7..9641c3e3 100644 --- a/include/infinicore_infer.h +++ b/include/infinicore_infer.h @@ -6,5 +6,6 @@ #include "infinicore_infer/models/deepseek.h" #include "infinicore_infer/models/jiuge.h" +#include "infinicore_infer/models/qwen3_vl.h" #endif /* INFINICORE_INFER_H */ diff --git a/include/infinicore_infer/models/qwen3_vl.h b/include/infinicore_infer/models/qwen3_vl.h new file mode 100644 index 00000000..eafa5a5e --- /dev/null +++ b/include/infinicore_infer/models/qwen3_vl.h @@ -0,0 +1,108 @@ +#ifndef MODEL_QWEN3_VL_H +#define MODEL_QWEN3_VL_H + +#include +#include +#include + +#include + +#include "../weights_loader.h" + +struct Qwen3VLModel; + +typedef struct +{ + infiniDtype_t dt_logits; + infiniDtype_t dt_linear_w; + infiniDtype_t dt_norm_w; + size_t nlayer, d, nh, nkvh, dh, di, dctx, dvoc; + float epsilon, theta; + uint32_t end_token; + char has_qkv_bias; + char use_qk_norm; + // Vision encoder parameters + size_t vision_hidden_size; + size_t vision_layers; + size_t vision_heads; + size_t patch_size; + size_t img_size; + // Token ids + uint32_t image_token_id; + uint32_t video_token_id; +} Qwen3VLMeta; + +//////////////////// APIs /////////////////////// +__C __export struct ModelWeights * +createQwen3VLWeights(const Qwen3VLMeta *, + infiniDevice_t device, + int ndev, + const int *dev_ids); + +/// @brief 创建模型 +/// @param device 协处理器种类 +/// @param ndev 协处理器数量 +/// @param dev_ids 协处理器编号,长度为 ndev +__C __export struct Qwen3VLModel * +createQwen3VLModel(const Qwen3VLMeta *, + const ModelWeights *); + +/// @brief 销毁模型 +__C __export void +destroyQwen3VLModel(struct Qwen3VLModel *); + +/// @brief 批次推理一轮,并采样出新的 token +/// @param tokens 输入 token 地址 +/// @param ntok 输入 token 数量 +/// @param nreq 请求数量 +/// @param req_lens 每个请求的 token 数量 +/// @param req_pos 每个请求的起始位置 +/// @param pos_ids ViT位置编码,格式[patches, 2] (h,w) +/// @param pos_ids_len pos_ids数组长度,应为patches*2 +/// @param llm_pos_ids LLM 3D mRoPE位置编码,格式[patches+text_len, 3] (t,h,w) +/// @param llm_pos_ids_len llm_pos_ids数组长度,应为(patches+text_len)*3 +/// @param rope_section 3D mRoPE区段配置,格式[3] (t_max,h_max,w_max) +/// @param rope_section_len rope_section数组长度,应为3 +/// @param kv_caches 每个请求的 KV Cache +/// @param temperature 采样温度(0. 
表示贪心采样) +/// @param topk 采样 topk(1 表示贪心采样) +/// @param topp 采样 topp +/// @param output 输出 token 数组,每个请求一个输出,长度至少为nreq +__C __export void +inferBatchQwen3VL(struct Qwen3VLModel *, + const uint32_t *tokens, uint32_t ntok, + const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos, + const uint32_t *pos_ids, uint32_t pos_ids_len, + const uint32_t *llm_pos_ids, uint32_t llm_pos_ids_len, + const uint32_t *rope_section, uint32_t rope_section_len, + const float *pixel_values, + struct KVCache **kv_caches, + const float *temperature, const uint32_t *topk, const float *topp, + uint32_t *output); + +/// @brief 批次推理一轮,输出 output embedding 后的 logits +/// @param tokens 输入 token 地址 +/// @param ntok 输入 token 数量 +/// @param nreq 请求数量 +/// @param req_lens 每个请求的 token 数量 +/// @param req_pos 每个请求的起始位置 +/// @param pos_ids ViT位置编码,格式[patches, 2] (h,w) +/// @param pos_ids_len pos_ids数组长度,应为patches*2 +/// @param llm_pos_ids LLM 3D mRoPE位置编码,格式[patches+text_len, 3] (t,h,w) +/// @param llm_pos_ids_len llm_pos_ids数组长度,应为(patches+text_len)*3 +/// @param rope_section 3D mRoPE区段配置,格式[3] (t_max,h_max,w_max) +/// @param rope_section_len rope_section数组长度,应为3 +/// @param kv_caches 每个请求的 KV Cache +/// @param logits 输出 token 数组,每个请求一个输出,长度至少为nreq +__C __export void +forwardBatchQwen3VL(struct Qwen3VLModel *, + const uint32_t *tokens, uint32_t ntok, + const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos, + const uint32_t *pos_ids, uint32_t pos_ids_len, + const uint32_t *llm_pos_ids, uint32_t llm_pos_ids_len, + const uint32_t *rope_section, uint32_t rope_section_len, + const float *pixel_values, + struct KVCache **kv_caches, + void *logits); + +#endif diff --git a/scripts/libinfinicore_infer/__init__.py b/scripts/libinfinicore_infer/__init__.py index 8fc5f4db..a61b3128 100644 --- a/scripts/libinfinicore_infer/__init__.py +++ b/scripts/libinfinicore_infer/__init__.py @@ -1,6 +1,6 @@ -from .base import DataType, DeviceType, KVCacheCStruct +from .base import DataType, DeviceType, KVCacheCStruct, ModelWeightsCStruct from .jiuge import JiugeModel, JiugeMetaCStruct, JiugeWeightsCStruct -from .jiuge_awq import JiugeAWQModel, JiugeAWQMetaCStruct, ModelWeightsCStruct +from .jiuge_awq import JiugeAWQModel, JiugeAWQMetaCStruct from .deepseek_v3 import ( DeepSeekV3Model, DeepSeekV3MetaCStruct, @@ -8,6 +8,7 @@ DeepSeekV3WeightLoaderCStruct, DeepSeekV3CacheCStruct, ) +from .qwen3_vl import Qwen3VLModel, Qwen3VLMetaCStruct __all__ = [ "DataType", @@ -23,5 +24,7 @@ "DeepSeekV3MetaCStruct", "DeepSeekV3WeightsCStruct", "DeepSeekV3WeightLoaderCStruct", + "Qwen3VLModel", + "Qwen3VLMetaCStruct", "ModelRegister", ] diff --git a/scripts/libinfinicore_infer/base.py b/scripts/libinfinicore_infer/base.py index 3305cdba..85c343cf 100644 --- a/scripts/libinfinicore_infer/base.py +++ b/scripts/libinfinicore_infer/base.py @@ -43,6 +43,10 @@ class KVCacheCStruct(ctypes.Structure): pass +class ModelWeightsCStruct(ctypes.Structure): + pass + + # Model registration system _model_registry = [] diff --git a/scripts/libinfinicore_infer/qwen3_vl.py b/scripts/libinfinicore_infer/qwen3_vl.py new file mode 100644 index 00000000..c1fd17a6 --- /dev/null +++ b/scripts/libinfinicore_infer/qwen3_vl.py @@ -0,0 +1,203 @@ +from .base import BaseModel, DataType, DeviceType, KVCacheCStruct, register_model, ModelWeightsCStruct +from ctypes import ( + c_size_t, + c_uint, + c_int, + c_float, + c_void_p, + POINTER, + Structure, + c_char, + c_char_p, +) + + +class Qwen3VLMetaCStruct(Structure): + _fields_ = [ + ("dt_logits", DataType), + ("dt_linear_w", 
DataType), + ("dt_norm_w", DataType), + ("nlayer", c_size_t), + ("d", c_size_t), + ("nh", c_size_t), + ("nkvh", c_size_t), + ("dh", c_size_t), + ("di", c_size_t), + ("dctx", c_size_t), + ("dvoc", c_size_t), + ("epsilon", c_float), + ("theta", c_float), + ("end_token", c_uint), + ("has_qkv_bias", c_char), + # vision encoder parameters + ("use_qk_norm", c_char), + ("vision_hidden_size", c_size_t), + ("vision_layers", c_size_t), + ("vision_heads", c_size_t), + ("patch_size", c_size_t), + ("img_size", c_size_t), + # token ids + ("image_token_id", c_uint), + ("video_token_id", c_uint), + ] + + +class Qwen3VLModelCStruct(Structure): + pass + + +@register_model +class Qwen3VLModel(BaseModel): + @classmethod + def register_lib(cls, lib): + """Register Qwen3VL model functions with the library""" + lib.createQwen3VLWeights.restype = POINTER(ModelWeightsCStruct) + lib.createQwen3VLWeights.argtypes = [ + POINTER(Qwen3VLMetaCStruct), + DeviceType, + c_int, + POINTER(c_int), + ] + + lib.createQwen3VLModel.restype = POINTER(Qwen3VLModelCStruct) + lib.createQwen3VLModel.argtypes = [ + POINTER(Qwen3VLMetaCStruct), + POINTER(ModelWeightsCStruct), + ] + + lib.destroyQwen3VLModel.argtypes = [POINTER(Qwen3VLModelCStruct)] + + lib.createKVCache.argtypes = [ + c_size_t, + c_size_t, + c_size_t, + c_size_t, + c_size_t, + DataType, + DeviceType, + POINTER(c_int), + c_size_t, + ] + lib.createKVCache.restype = POINTER(KVCacheCStruct) + + lib.dropKVCache.argtypes = [POINTER(KVCacheCStruct)] + + lib.inferBatchQwen3VL.argtypes = [ + POINTER(Qwen3VLModelCStruct), + POINTER(c_uint), + c_uint, + POINTER(c_uint), + c_uint, + POINTER(c_uint), + POINTER(c_uint), + c_uint, + POINTER(c_uint), # llm_pos_ids + c_uint, # llm_pos_ids_len + POINTER(c_uint), # rope_section + c_uint, # rope_section_len + c_void_p, + POINTER(POINTER(KVCacheCStruct)), + POINTER(c_float), + POINTER(c_uint), + POINTER(c_float), + POINTER(c_uint), + ] + + lib.forwardBatchQwen3VL.argtypes = [ + POINTER(Qwen3VLModelCStruct), + POINTER(c_uint), + c_uint, + POINTER(c_uint), + c_uint, + POINTER(c_uint), + POINTER(c_uint), + c_uint, + POINTER(c_uint), # llm_pos_ids + c_uint, # llm_pos_ids_len + POINTER(c_uint), # rope_section + c_uint, # rope_section_len + c_void_p, + POINTER(POINTER(KVCacheCStruct)), + c_void_p, + ] + + lib.loadModelWeight.argtypes = [ + POINTER(ModelWeightsCStruct), + c_char_p, + c_void_p, + ] + + def create_weights(self, meta, device_type, ndev, dev_ids): + return self.lib.createQwen3VLWeights(meta, device_type, ndev, dev_ids) + + def create_model(self, meta, weights): + return self.lib.createQwen3VLModel(meta, weights) + + def destroy_model(self, model): + self.lib.destroyQwen3VLModel(model) + + def create_kv_cache( + self, nlayer, max_len, nkvh, dk, dv, dtype, device, dev_ids, ndev + ): + return self.lib.createKVCache( + nlayer, max_len, nkvh, dk, dv, dtype, device, dev_ids, ndev + ) + + def drop_kv_cache(self, kv_cache): + self.lib.dropKVCache(kv_cache) + + def load_weight(self, weights, name, data): + self.lib.loadModelWeight(weights, name.encode("utf-8"), data) + + def infer_batch( + self, + model, + tokens, + ntok, + req_lens, + nreq, + req_pos, + pos_ids, + pos_ids_len, + llm_pos_ids, + llm_pos_ids_len, + rope_section, + rope_section_len, + pixel_values, + kv_caches, + temperature, + topk, + topp, + output, + ): + self.lib.inferBatchQwen3VL( + model, + tokens, + ntok, + req_lens, + nreq, + req_pos, + pos_ids, + pos_ids_len, + llm_pos_ids, + llm_pos_ids_len, + rope_section, + rope_section_len, + pixel_values, + kv_caches, + temperature, 
+ topk, + topp, + output, + ) + + def forward_batch( + self, model, tokens, ntok, req_lens, nreq, req_pos, pos_ids, pos_ids_len, + llm_pos_ids, llm_pos_ids_len, rope_section, rope_section_len, + pixel_values, kv_caches, logits + ): + self.lib.forwardBatchQwen3VL( + model, tokens, ntok, req_lens, nreq, req_pos, pos_ids, pos_ids_len, + llm_pos_ids, llm_pos_ids_len, rope_section, rope_section_len, + pixel_values, kv_caches, logits + ) diff --git a/scripts/qwen3vl.py b/scripts/qwen3vl.py new file mode 100644 index 00000000..fcefb769 --- /dev/null +++ b/scripts/qwen3vl.py @@ -0,0 +1,900 @@ +from typing import List, Sequence +import math +import os +from pathlib import Path +import safetensors +import sys +import time +import json +import torch +import transformers +import numpy as np +from PIL import Image + +from libinfinicore_infer import ( + Qwen3VLModel, + Qwen3VLMetaCStruct, + DataType, + DeviceType, + KVCacheCStruct, +) +from infer_task import InferTask, KVCache + +from ctypes import POINTER, c_float, c_int, c_uint, c_void_p, byref + +torch.set_default_device("cpu") + + +def smart_resize(height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 1280): + """基于 transformers 的 smart_resize 实现""" + import math + if max(height, width) / min(height, width) > 200: + raise ValueError(f"aspect ratio too large: {max(height, width) / min(height, width)}") + + h_bar = round(height / factor) * factor + w_bar = round(width / factor) * factor + + if h_bar * w_bar > max_pixels: + beta = math.sqrt((height * width) / max_pixels) + h_bar = max(factor, math.floor(height / beta / factor) * factor) + w_bar = max(factor, math.floor(width / beta / factor) * factor) + elif h_bar * w_bar < min_pixels: + beta = math.sqrt(min_pixels / (height * width)) + h_bar = math.ceil(height * beta / factor) * factor + w_bar = math.ceil(width * beta / factor) * factor + + return h_bar, w_bar + + +def preprocess_image_qwen3vl(image_path: str): + """ + 完整的 Qwen3-VL 图像预处理流程 + 基于 transformers 的实现:加载→resize→rescale→normalize→CHW→reshape→permute→flatten + """ + # 配置参数 (从 Qwen3-VL config 中获取) + patch_size = 16 + merge_size = 2 # spatial_merge_size + temporal_patch_size = 2 + factor = patch_size * merge_size # 28 + min_pixels = 4 * 28 * 28 + max_pixels = 16384 * 28 * 28 + + # 1. 加载图像 + image = Image.open(image_path).convert('RGB') + height, width = image.size[1], image.size[0] # PIL: (width, height) + + # 2. Smart resize (保持宽高比,满足像素数和因子整除要求) + resized_height, resized_width = smart_resize(height, width, factor, min_pixels, max_pixels) + image = image.resize((resized_width, resized_height), Image.BILINEAR) + + print(f"图像预处理: {width}×{height} -> {resized_width}×{resized_height}") + + # 3. 转换为张量 + patches = torch.tensor(np.array(image)).float() + + # 4. Rescale (0-255 -> 0-1) + patches = patches / 255.0 + + # 5. Normalize (ImageNet 标准) + # mean = torch.tensor([0.485, 0.456, 0.406]) + mean = torch.tensor([0.5, 0.5, 0.5]) + # std = torch.tensor([0.229, 0.224, 0.225]) + std = torch.tensor([0.5, 0.5, 0.5]) + patches = (patches - mean) / std + + # 6. CHW 调整: [H, W, C] -> [C, H, W] + patches = patches.permute(2, 0, 1) + + # 7. 添加 batch 和时间维度 [C, H, W] -> [1, C, H, W] (模拟单帧) + patches = patches.unsqueeze(0) + + # 8. Temporal padding (确保帧数能被 temporal_patch_size 整除) + if patches.shape[0] % temporal_patch_size != 0: + repeats = patches[-1:].repeat(temporal_patch_size - patches.shape[0] % temporal_patch_size, 1, 1, 1) + patches = torch.cat([patches, repeats], dim=0) + + # 9. 
Grid 计算 + channel = patches.shape[1] + grid_t = patches.shape[0] // temporal_patch_size + grid_h, grid_w = resized_height // patch_size, resized_width // patch_size + + # 9. Reshape 和 Permute (按照 transformers 实现) + patches = patches.view( + grid_t, + temporal_patch_size, + channel, + grid_h // merge_size, + merge_size, + patch_size, + grid_w // merge_size, + merge_size, + patch_size, + ) + patches = patches.permute(0, 3, 6, 4, 7, 2, 1, 5, 8) + + # 10. Flatten + flatten_patches = patches.reshape( + grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size + ) + + # Grid_thw (注意:这里是原始的 grid 大小,不是 merge 后的) + grid_thw = torch.tensor([[grid_t, grid_h, grid_w]], dtype=torch.int32) + + return flatten_patches, grid_thw + + +def compute_2d_mrope_pos_ids(grid_thw: torch.Tensor, spatial_merge_size: int = 2): + """ + 计算 2D MRoPE 的 pos_ids,基于 vLLM 的实现 + + Args: + grid_thw: [batch, 3] 张量,包含 [t, h, w] (原始 grid 大小) + spatial_merge_size: 空间合并大小,默认2 + + Returns: + pos_ids: [num_patches, 2] 张量,包含 [h_pos, w_pos] 坐标 + """ + pos_ids_list = [] + + for t, h, w in grid_thw: + t, h, w = int(t), int(h), int(w) + + # 按照 vLLM 的 rot_pos_emb 实现,考虑 spatial_merge_size + # 生成高度位置索引 + hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) + hpos_ids = hpos_ids.reshape( + h // spatial_merge_size, + spatial_merge_size, + w // spatial_merge_size, + spatial_merge_size, + ) + hpos_ids = hpos_ids.permute(0, 2, 1, 3).flatten() + + # 生成宽度位置索引 + wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) + wpos_ids = wpos_ids.reshape( + h // spatial_merge_size, + spatial_merge_size, + w // spatial_merge_size, + spatial_merge_size, + ) + wpos_ids = wpos_ids.permute(0, 2, 1, 3).flatten() + + # 组合坐标并重复时间维度 + pos_ids = torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1) + pos_ids_list.append(pos_ids) + + return torch.cat(pos_ids_list, dim=0) + + +class Qwen3VLMetaFromConfig(Qwen3VLMetaCStruct): + def __init__(self, config, dtype=torch.bfloat16, max_tokens=None): + if dtype == torch.float16: + dt_ = DataType.INFINI_DTYPE_F16 + elif dtype == torch.float32: + dt_ = DataType.INFINI_DTYPE_F32 + elif dtype == torch.bfloat16: + dt_ = DataType.INFINI_DTYPE_BF16 + else: + dt_ = DataType.INFINI_DTYPE_BF16 + + self.scale_input = 1.0 + self.scale_output = 1.0 + self.scale_o = 1.0 + self.scale_down = 1.0 + has_qkv_bias = 0 + # 配置可能在顶层或在 text_config 中 + text_config = config.get("text_config", config) + eos_token_id = text_config.get("eos_token_id") + vision_config = config.get("vision_config", {}) + + super().__init__( + dt_logits=dt_, + dt_linear_w=dt_, + dt_norm_w=dt_, + nlayer=text_config["num_hidden_layers"], + d=text_config["hidden_size"], + nh=text_config["num_attention_heads"], + nkvh=text_config["num_key_value_heads"], + dh=text_config["head_dim"], + di=text_config["intermediate_size"], + dctx=text_config["max_position_embeddings"] if max_tokens is None else max_tokens, + dvoc=text_config["vocab_size"], + epsilon=text_config["rms_norm_eps"], + theta=text_config["rope_theta"], + end_token=eos_token_id, + has_qkv_bias=has_qkv_bias, + # vision encoder parameters + use_qk_norm=1 if text_config.get("use_qk_norm", False) else 0, + vision_hidden_size=vision_config.get("hidden_size", 768), + vision_layers=vision_config.get("depth", 12), + vision_heads=vision_config.get("num_heads", 12), + patch_size=vision_config.get("patch_size", 16), + img_size=vision_config.get("img_size", 768), + image_token_id=int(config.get("image_token_id", 151654)), + video_token_id=int(config.get("video_token_id", 151656)), + ) + self.torch_dtype_logits 
= dtype + # 保留到python对象上,供上层使用 + try: + self.image_token_id = int(config.get("image_token_id", 151654)) + except Exception: + self.image_token_id = 151654 + try: + self.video_token_id = int(config.get("video_token_id", 151656)) + except Exception: + self.video_token_id = 151656 + + +class Qwen3VLBatchedTask: + def __init__(self, tasks: List[InferTask], image_path: str | None = None, video_path: str | None = None, config: dict | None = None): + self.tasks = tasks + self.nreq = len(tasks) + + # Precompute fields + token_lists = [t.tokens for t in tasks] + self.req_lens_list = [len(toks) for toks in token_lists] + self.req_pos_list = [t.pos for t in tasks] + self.kv_cache_ptrs = [t.kvcache().data() for t in tasks] + self.temperaturas_list = [t.temperature for t in tasks] + self.topks_list = [t.topk for t in tasks] + self.topps_list = [t.topp for t in tasks] + + # Flatten token lists - 对于 ViT,tokens 实际上是 patch embeddings + flat_tokens = [tok for toks in token_lists for tok in toks] + + # 统一:ntok 始终为文本 token 数;pixel_values 仅在 prefill(首轮,pos==0) 且有图像时提供 + self.ntok = len(flat_tokens) + self.pixel_values = None + self.num_patches = 0 + self.patch_dim = 0 + self.grid_thw = None + self.image_path = image_path + self.video_path = video_path + # 从config中读取image/video token id(若存在) + self.image_token_id = None + self.video_token_id = None + if isinstance(config, dict): + self.image_token_id = config.get("image_token_id", None) + self.video_token_id = config.get("video_token_id", None) + # prefill 判断:仅当该 batch 中存在 pos==0 的请求且包含图像占位符时,计算像素与pos_ids + any_prefill_with_image = False + def is_image_tok(tid: int) -> bool: + if self.image_token_id is not None: + try: + return tid == int(self.image_token_id) + except Exception: + pass + return 151652 <= tid <= 151656 + def is_video_tok(tid: int) -> bool: + if self.video_token_id is not None: + try: + return tid == int(self.video_token_id) + except Exception: + pass + return tid == 151656 + + for task in tasks: + # print(f"[DEBUG] Task pos={task.pos}, tokens={task.tokens}") + has_image_token = any(is_image_tok(token) or is_video_tok(token) for token in task.tokens) + # print(f"[DEBUG] Has image/video token: {has_image_token}") + if task.pos == 0 and has_image_token: + any_prefill_with_image = True + break + # print(f"[DEBUG] any_prefill_with_image: {any_prefill_with_image}") + # print(f"[DEBUG] image_path: {self.image_path}") + if any_prefill_with_image: + try: + src_path = self.image_path if self.image_path is not None else self.video_path + if src_path is None: + raise RuntimeError("no image/video path provided for prefill with vision input") + # print(f"[DEBUG] Processing image: {src_path}") + self.pixel_values, self.grid_thw = preprocess_image_qwen3vl(src_path) + self.num_patches = self.pixel_values.shape[0] # 设置 patch 数量 + self.patch_dim = self.pixel_values.shape[1] + # print(f"[DEBUG] Pixel values shape: {self.pixel_values.shape}") + # print(f"[DEBUG] Grid THW: {self.grid_thw}") + # print(f"[DEBUG] Number of patches: {self.num_patches}") + except Exception as _e: + self.pixel_values = None + self.grid_thw = None + self.num_patches = 0 + self.patch_dim = 0 + + # 实现 2D MRoPE pos_ids 计算 + # 集成图像 pos_ids 到批处理中 + flat_pos_ids = [] + self.has_vision = False # 默认无视觉输入 + self.vision_pos_shape = None + + if any_prefill_with_image and getattr(self, 'pixel_values', None) is not None and getattr(self, 'grid_thw', None) is not None: + try: + pos_ids = compute_2d_mrope_pos_ids(self.grid_thw) + for pos in pos_ids: + flat_pos_ids.extend([int(pos[0].item()), 
int(pos[1].item())]) + self.has_vision = True + self.vision_pos_shape = pos_ids.shape + except Exception as e: + print(f"警告: 图像 pos_ids 计算失败,prefill 将降级为纯文本: {e}") + self.has_vision = False + else: + # 纯文本或decode:为每个token提供简化pos_ids,避免C端空指针 + for toks in token_lists: + for i in range(len(toks)): + flat_pos_ids.extend([i, 0]) + self.has_vision = False + + # Convert to ctypes arrays in one pass + self.tokens = (c_uint * self.ntok)(*flat_tokens) + self.req_lens = (c_uint * self.nreq)(*self.req_lens_list) + self.req_pos = (c_uint * self.nreq)(*self.req_pos_list) + # 确保 flat_pos_ids 都是整数并构造 ctypes 数组 + safe_pos_ids = [int(x) for x in flat_pos_ids] if flat_pos_ids else [0] + self.pos_ids = (c_uint * len(safe_pos_ids))(*safe_pos_ids) + self.pos_ids_len = len(safe_pos_ids) + self.kv_caches = (POINTER(KVCacheCStruct) * + self.nreq)(*self.kv_cache_ptrs) + self.temperaturas = (c_float * self.nreq)(*self.temperaturas_list) + self.topks = (c_uint * self.nreq)(*self.topks_list) + self.topps = (c_float * self.nreq)(*self.topps_list) + + # 构造 3D mRoPE 参数 + self.llm_pos_ids = None + self.llm_pos_ids_len = 0 + self.rope_section = None + self.rope_section_len = 0 + + # 构造 deepstack_layers 参数 + vision_config = getattr(config, 'vision_config', {}) + deepstack_visual_indexes = getattr(vision_config, 'deepstack_visual_multiscale_indexes', [3, 6, 9]) + self.deepstack_layers = (c_uint * len(deepstack_visual_indexes))(*deepstack_visual_indexes) + self.deepstack_layers_len = len(deepstack_visual_indexes) + + if self.has_vision and hasattr(self, 'num_patches') and self.num_patches > 0: + # 构造 3D MRoPE pos_ids,考虑token替换:ntok-1+num_patches + # 基于Rust代码的逻辑:pre_text + vision + post_text + + # 计算image token的位置(假设只有一个image token) + image_token_id = 151655 # <|image_pad|> token + image_token_pos = -1 + for i, token in enumerate(self.tokens): + if token == image_token_id: + image_token_pos = i + break + + if image_token_pos == -1: + # 如果没找到image token,按原逻辑处理 + pre_text_len = 0 + post_text_len = self.ntok + else: + pre_text_len = image_token_pos # image token之前的文本 + post_text_len = self.ntok - image_token_pos - 1 # image token之后的文本 + + # print(f"[DEBUG] 3D pos_ids: pre_text_len={pre_text_len}, vision_len={self.num_patches}, post_text_len={post_text_len}") + + total_len = pre_text_len + self.num_patches + post_text_len + llm_pos_ids_flat = [] + + # 图像前文本:每个维度都是连续递增 + for i in range(pre_text_len): + llm_pos_ids_flat.extend([i, i, i]) + + # 视觉部分:参考Rust代码的3D位置计算 + img_start_pos = pre_text_len + # 简化处理:假设t=1, h=20, w=30 (对应600个patches) + t_len, h_len, w_len = 1, 20, 30 + for t in range(t_len): + for h in range(h_len): + for w in range(w_len): + t_pos = img_start_pos + t + h_pos = img_start_pos + h + w_pos = img_start_pos + w + llm_pos_ids_flat.extend([t_pos, h_pos, w_pos]) + + # 图像后文本:从视觉最大位置+1开始 + vision_max_pos = max(img_start_pos + t_len - 1, + img_start_pos + h_len - 1, + img_start_pos + w_len - 1) + text_start_pos = vision_max_pos + 1 + for i in range(post_text_len): + pos_val = text_start_pos + i + llm_pos_ids_flat.extend([pos_val, pos_val, pos_val]) + + self.llm_pos_ids_len = len(llm_pos_ids_flat) + self.llm_pos_ids = (c_uint * self.llm_pos_ids_len)(*llm_pos_ids_flat) + # print(f"[DEBUG] 构造的3D pos_ids长度: {self.llm_pos_ids_len//3}, 总长度: {self.llm_pos_ids_len}") + + # 构造 rope_section [3] = [t_max, h_max, w_max] + # 从config读取,默认为[24, 20, 20] + rope_section_vals = getattr(config, 'rope_scaling', {}).get('mrope_section', [24, 20, 20]) + self.rope_section_len = 3 + self.rope_section = (c_uint * 3)(*rope_section_vals) + + def 
input_args(self): + # pixel_values 作为裸指针传递;无视觉输入则传空指针 + if getattr(self, 'pixel_values', None) is not None: + # 确保是连续内存 + pv = self.pixel_values.contiguous() + pixel_values_ptr = c_void_p(int(pv.data_ptr())) + else: + pixel_values_ptr = c_void_p(0) + + # 处理3D mRoPE参数的空指针 + llm_pos_ids_ptr = self.llm_pos_ids if self.llm_pos_ids is not None else POINTER(c_uint)() + rope_section_ptr = self.rope_section if self.rope_section is not None else POINTER(c_uint)() + + return ( + self.tokens, + self.ntok, + self.req_lens, + self.nreq, + self.req_pos, + self.pos_ids, + self.pos_ids_len, + llm_pos_ids_ptr, + self.llm_pos_ids_len, + rope_section_ptr, + self.rope_section_len, + pixel_values_ptr, + self.kv_caches, + self.temperaturas, + self.topks, + self.topps, + ) + + def get_vision_info(self): + """获取视觉相关的信息,供 C++ 端使用""" + return { + 'has_vision': getattr(self, 'has_vision', False), + 'vision_pos_shape': getattr(self, 'vision_pos_shape', None), + 'pos_ids_should_be_2d': getattr(self, 'has_vision', False) + } + + +class Qwen3VLForCausalLM: + def __init__(self, model_dir_path, device=DeviceType.DEVICE_TYPE_CPU, ndev=1, max_tokens=None): + load_start_time = time.time() + print(f"Creating model on {ndev} devices...") + with open(os.path.join(model_dir_path, "config.json"), "r") as f: + config = json.load(f) + self.config = config + # eos_token_id 可能在顶层或在 text_config 中 + eos_token_id = self.config.get("eos_token_id") or self.config.get("text_config", {}).get("eos_token_id") + self.eos_token_id = ( + [eos_token_id] if type(eos_token_id) == int else eos_token_id + ) + self.dev_ids = (c_int * ndev)(*[i for i in range(ndev)]) + self.ndev = ndev + self.device = device + self.meta = Qwen3VLMetaFromConfig(config, max_tokens=max_tokens) + + self.qwen3vl_model = Qwen3VLModel() + + self.weights = self.qwen3vl_model.create_weights( + byref(self.meta), + self.device, + ndev, + self.dev_ids, + ) + self.tokenizer = transformers.AutoTokenizer.from_pretrained( + model_dir_path, trust_remote_code=True + ) + + load_end_time = time.time() + print(f"Time used: {load_end_time - load_start_time:.3f}s") + + load_start_time = time.time() + print("Loading model weights to host...") + + self.load_all_safetensors_from_dir(os.path.join(model_dir_path)) + + self.model_instance = self.qwen3vl_model.create_model( + byref(self.meta), + self.weights, + ) + load_end_time = time.time() + print(f"Time used: {load_end_time - load_start_time:.3f}s") + + def load_all_safetensors_from_dir(self, dir_path_: str): + dir_path_ = Path(dir_path_) + total_keys = 0 + + # 创建检查文件夹 + check_dir = Path("./check") + check_dir.mkdir(exist_ok=True) + + for file in sorted(dir_path_.glob("*.safetensors")): + with safetensors.safe_open(file, framework="pt", device="cpu") as f: + for key in f.keys(): + total_keys += 1 + + tensor = f.get_tensor(key) + + # 保存张量 + self.save_tensor(key, tensor) + + # if "o_proj.scales" in key: + # tensor = tensor * self.meta.scale_o + # elif "down_proj.scales" in key: + # tensor = tensor * self.meta.scale_down + # elif "embed_tokens.weight" in key: + # tensor = tensor * self.meta.scale_input + # elif "lm_head.weight" in key: + # tensor = tensor * self.meta.scale_output + + self.qwen3vl_model.load_weight( + self.weights, key, tensor.data_ptr() + ) + print(f"加载的张量 key 总数: {total_keys}") + + def save_tensor(self, key: str, tensor, check_dir=None): + if check_dir is None: + check_dir = Path("./check") + + # 创建保存目录 + check_dir.mkdir(exist_ok=True) + + # 根据键名生成文件名 + filename = None + + # 1. 
Patch Embedding + if key == "visual.patch_embed.proj.weight": + filename = "1.patch_embd_w.txt" + elif key == "visual.patch_embed.proj.bias": + filename = "1.patch_embd_bias.txt" + + # 2. Position Embedding + elif key == "visual.pos_embed.weight": + filename = "2.pos_embd.txt" + + # 3. Block0 相关张量 + elif key == "visual.blocks.0.norm1.weight": + filename = "3.block0.norm1_w.txt" + elif key == "visual.blocks.0.norm1.bias": + filename = "3.block0.norm1.bias.txt" + elif key == "visual.blocks.0.attn.qkv.weight": + filename = "3.block0.attn.qkv_w.txt" + elif key == "visual.blocks.0.attn.qkv.bias": + filename = "3.block0.attn.qkv.bias.txt" + elif key == "visual.blocks.0.attn.proj.weight": + filename = "3.block0.attn.proj_w.txt" + elif key == "visual.blocks.0.attn.proj.bias": + filename = "3.block0.attn.proj.bias.txt" + elif key == "visual.blocks.0.norm2.weight": + filename = "4.block0.norm2_w.txt" + elif key == "visual.blocks.0.norm2.bias": + filename = "4.block0.norm2.bias.txt" + elif key == "visual.blocks.0.mlp.linear_fc1.weight": + filename = "4.block0.mlp.fc1_w.txt" + elif key == "visual.blocks.0.mlp.linear_fc1.bias": + filename = "4.block0.mlp.fc1.bias.txt" + elif key == "visual.blocks.0.mlp.linear_fc2.weight": + filename = "4.block0.mlp.fc2_w.txt" + elif key == "visual.blocks.0.mlp.linear_fc2.bias": + filename = "4.block0.mlp.fc2.bias.txt" + + # 5. Merger + elif key == "visual.merger.norm.weight": + filename = "5.merger.norm_w.txt" + elif key == "visual.merger.norm.bias": + filename = "5.merger.norm.bias.txt" + elif key == "visual.merger.linear_fc1.weight": + filename = "5.merger.fc1_w.txt" + elif key == "visual.merger.linear_fc1.bias": + filename = "5.merger.fc1.bias.txt" + elif key == "visual.merger.linear_fc2.weight": + filename = "5.merger.fc2_w.txt" + elif key == "visual.merger.linear_fc2.bias": + filename = "5.merger.fc2.bias.txt" + + # 兼容原有的merger键名 + elif key == "visual.merger.ln_q.weight": + filename = "5.merger.ln_q_w.txt" + elif key == "visual.merger.ln_q.bias": + filename = "5.merger.ln_q_bias.txt" + elif key == "visual.merger.mlp.0.weight": + filename = "5.merger.mlp0_w.txt" + elif key == "visual.merger.mlp.0.bias": + filename = "5.merger.mlp0_bias.txt" + elif key == "visual.merger.mlp.2.weight": + filename = "5.merger.mlp2_w.txt" + elif key == "visual.merger.mlp.2.bias": + filename = "5.merger.mlp2_bias.txt" + + # 6. Deepstack Merger List (动态匹配) + elif "visual.deepstack_merger_list." 
in key: + # 提取索引号 + parts = key.split(".") + if len(parts) >= 3: + try: + idx = int(parts[2]) # deepstack_merger_list.{idx}.xxx + suffix = ".".join(parts[3:]) + prefix = f"6.deepstack{idx}" + + if suffix == "norm.weight": + filename = f"{prefix}.norm_w.txt" + elif suffix == "norm.bias": + filename = f"{prefix}.norm.bias.txt" + elif suffix == "linear_fc1.weight": + filename = f"{prefix}.fc1_w.txt" + elif suffix == "linear_fc1.bias": + filename = f"{prefix}.fc1.bias.txt" + elif suffix == "linear_fc2.weight": + filename = f"{prefix}.fc2_w.txt" + elif suffix == "linear_fc2.bias": + filename = f"{prefix}.fc2.bias.txt" + except ValueError: + pass + + # 如果没有匹配的文件名,直接返回 + if filename is None: + return + + filepath = check_dir / filename + + with open(filepath, 'w', encoding='utf-8') as f: + # 写入形状信息 + shape_str = f"Shape: {tuple(tensor.shape)}\n" + f.write(shape_str) + + # 写入步长信息 + strides_str = f"Strides: {tuple(tensor.stride())}\n" + f.write(strides_str) + + # 获取PyTorch风格的字符串表示(不进行类型转换) + tensor_str = str(tensor.detach().cpu()) + + # 写入张量数据 + f.write(tensor_str) + + def max_context_len(self): + return self.meta.dctx + + def create_kv_cache(self): + return self.qwen3vl_model.create_kv_cache( + self.meta.nlayer, + self.meta.dctx, + self.meta.nkvh, + self.meta.dh, + self.meta.dh, + self.meta.dt_logits, + self.device, + self.dev_ids, + self.ndev, + ) + + def drop_kv_cache(self, kv_cache): + self.qwen3vl_model.drop_kv_cache(kv_cache) + + def batch_infer_one_round(self, tasks: List[InferTask], image_path: str | None = None, video_path: str | None = None): + output = (c_uint * len(tasks))() + batch_inputs = Qwen3VLBatchedTask(tasks, image_path=image_path, video_path=video_path, config=self.config) + self.qwen3vl_model.infer_batch( + self.model_instance, + *(batch_inputs.input_args()), + output, + ) + return list(output) + + def generate(self, input_content, max_steps, topp_=1.0, topk_=1, temperature_=1.0, image_path=None, video_path=None): + # 如果有图片,需要在内容中添加图片占位符 + if image_path is not None: + input_content = f"<|vision_start|><|image_pad|><|vision_end|>{input_content}" + + input_content = self.tokenizer.apply_chat_template( + conversation=[{"role": "user", "content": input_content}], + add_generation_prompt=True, + tokenize=False, + ) + print(input_content, end="", flush=True) + tokens = self.tokenizer.encode(input_content) + # print(f"[DEBUG] Generated tokens: {tokens}") + infer_task = InferTask( + 0, + tokens, + self.max_context_len(), + temperature_, + topk_, + topp_, + self.eos_token_id, + ) + infer_task.bind_kvcache(KVCache(self)) + + steps = 0 + total_time = 0 + output_content = "" + + for step_i in range(max_steps): + start_time = time.time() + # prefill: step 0,传入image/video;decode:后续步不传 + output_tokens = self.batch_infer_one_round( + [infer_task], + image_path=image_path if step_i == 0 else None, + video_path=video_path if step_i == 0 else None, + ) + end_time = time.time() + steps += 1 + # output_str = ( + # self.tokenizer._tokenizer.id_to_token(output_tokens[0]) + # .replace("▁", " ") + # .replace("<0x0A>", "\n") + # ) + output_str = self.tokenizer.decode(output_tokens[0]) + output_content += output_str + print(f"[DEBUG] Step {step_i}: token_id={output_tokens[0]}, token='{output_str}', eos_tokens={self.eos_token_id}") + print(output_str, end="", flush=True) + if output_tokens[0] in self.eos_token_id: + print(f"[DEBUG] EOS token detected, breaking generation loop") + break + infer_task.next(output_tokens[0]) + + if step_i > 0: + total_time += end_time - start_time + + print("\n") + avg_time 
= total_time * 1000 / (steps - 1) + print(f"Time per step: {avg_time:.3f}ms") + + infer_task._kv_cache.drop(self) + return output_content, avg_time + + def perplexity(self, test_sequences: List[Sequence[int]], batch_size=10): + tasks = [InferTask(i, [], self.max_context_len(), 1.0, 1, 1.0, self.eos_token_id) for i in range(batch_size)] + kv_caches = [KVCache(self) for _ in range(batch_size)] + + nll = 0.0 + total_len = 0 + + for i in range(0, len(test_sequences), batch_size): + batch_id = 0 + true_tokens = [] + while batch_id < batch_size and batch_id + i < len(test_sequences): + input_tokens = test_sequences[i + batch_id][:-1] + true_tokens.extend(test_sequences[i + batch_id][1:]) + tasks[batch_id].tokens = input_tokens + tasks[batch_id].bind_kvcache(kv_caches[batch_id]) + batch_id += 1 + + batch_inputs = Qwen3VLBatchedTask(tasks[:batch_id], image_path=None, config=self.config) + logits = torch.zeros( + (batch_inputs.ntok, self.meta.dvoc), dtype=self.meta.torch_dtype_logits + ) + # 评测路径:decode阶段不传像素;传递pos_ids以保持mrope输入稳定 + # 简化forward_batch调用:使用input_args()展开,但需要插入decode专用的空pixel_values + args = list(batch_inputs.input_args()) + args[11] = c_void_p(0) # 替换pixel_values为空指针(decode阶段不传像素) + + self.qwen3vl_model.forward_batch( + self.model_instance, + *args, + logits.data_ptr(), + ) + + logits = logits.float() + token_ids = torch.tensor(true_tokens, dtype=torch.int64) # [ntok,] + log_probs = torch.nn.functional.log_softmax( + logits, dim=-1) # (ntok, vocab) + token_logprobs = log_probs[ + torch.arange(batch_inputs.ntok), token_ids + ] # (ntok,) + + start = 0 + for l in batch_inputs.req_lens_list: + nll += -token_logprobs[start: start + l].sum().item() + start += l + total_len += token_logprobs.numel() + + for task in tasks: + task.release_kvcache() + + return math.exp(nll / total_len) + + def destroy_model_instance(self): + self.qwen3vl_model.destroy_model(self.model_instance) + print("Model destroyed") + + +def test(): + if len(sys.argv) < 3: + print( + "Usage: python qwen3vl.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore] [n_device]" + ) + sys.exit(1) + model_path = sys.argv[2] + device_type = DeviceType.DEVICE_TYPE_CPU + if sys.argv[1] == "--cpu": + device_type = DeviceType.DEVICE_TYPE_CPU + elif sys.argv[1] == "--nvidia": + device_type = DeviceType.DEVICE_TYPE_NVIDIA + elif sys.argv[1] == "--cambricon": + device_type = DeviceType.DEVICE_TYPE_CAMBRICON + elif sys.argv[1] == "--ascend": + device_type = DeviceType.DEVICE_TYPE_ASCEND + elif sys.argv[1] == "--metax": + device_type = DeviceType.DEVICE_TYPE_METAX + elif sys.argv[1] == "--moore": + device_type = DeviceType.DEVICE_TYPE_MOORE + elif sys.argv[1] == "--iluvatar": + device_type = DeviceType.DEVICE_TYPE_ILUVATAR + else: + print( + "Usage: python qwen3vl.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore] [n_device]" + ) + sys.exit(1) + + # 首先测试 pos_ids 计算 + # test_pos_ids_calculation() + + ndev = int(sys.argv[3]) if len(sys.argv) > 3 else 1 + max_tokens = 1024 + model = Qwen3VLForCausalLM(model_path, device_type, ndev, max_tokens=max_tokens) + image_path = "/home/cearx/qy/model/Qwen3-VL-2B-Vit-86M-0828/image3.jpg" + model.generate("描述这张图片", 500, image_path=image_path) + model.destroy_model_instance() + + +def test_pos_ids_calculation(): + """测试 2D MRoPE pos_ids 计算""" + print("=== 测试 2D MRoPE pos_ids 计算 ===") + + # 测试图像路径 + image_path = "/home/cearx/qy/model/Qwen3-VL-2B-Vit-86M-0828/image3.jpg" + + try: + # 预处理图像 + pixel_values, grid_thw = preprocess_image_qwen3vl(image_path) + print(f"图像预处理完成:") + 
print(f" pixel_values shape: {pixel_values.shape}") + print(f" grid_thw: {grid_thw}") + + # 计算 pos_ids + pos_ids = compute_2d_mrope_pos_ids(grid_thw) + print(f"pos_ids 计算完成:") + print(f" pos_ids shape: {pos_ids.shape}") + print(f" pos_ids 前10个元素:") + print(f" {pos_ids[:10]}") + print(f" pos_ids 最后10个元素:") + print(f" {pos_ids[-10:]}") + + # 验证 pos_ids 的合理性 + t, h, w = grid_thw[0].tolist() + # grid_thw 现在已经是 spatial merge 后的网格大小 + expected_patches = t * h * w + actual_patches = pos_ids.shape[0] + print(f"期望 patch 数量: {expected_patches}") + print(f"实际 patch 数量: {actual_patches}") + + # 检查 pos_ids 的值范围 + h_max = pos_ids[:, 0].max().item() + w_max = pos_ids[:, 1].max().item() + print(f"pos_ids 高度范围: 0 到 {h_max}") + print(f"pos_ids 宽度范围: 0 到 {w_max}") + + # 坐标范围应该对应 grid_thw 的范围 + expected_h_max = h - 1 + expected_w_max = w - 1 + print(f"预期高度范围: 0 到 {expected_h_max}") + print(f"预期宽度范围: 0 到 {expected_w_max}") + + if expected_patches == actual_patches: + print("✓ pos_ids 数量验证通过!") + else: + print("✗ pos_ids 数量验证失败!") + print(f" 详细信息: grid_thw={grid_thw}, 期望={expected_patches}, 实际={actual_patches}") + + # 检查坐标范围是否正确 + if h_max == expected_h_max and w_max == expected_w_max: + print("✓ pos_ids 坐标范围验证通过!") + else: + print("✗ pos_ids 坐标范围验证失败!") + print(f" 实际坐标最大值: h={h_max}, w={w_max}") + print(f" 期望坐标最大值: h={expected_h_max}, w={expected_w_max}") + + except Exception as e: + print(f"测试失败: {e}") + import traceback + traceback.print_exc() + + +if __name__ == "__main__": + test() diff --git a/src/cache_manager/opcache_manager.hpp b/src/cache_manager/opcache_manager.hpp index 4c49e961..96e9df05 100644 --- a/src/cache_manager/opcache_manager.hpp +++ b/src/cache_manager/opcache_manager.hpp @@ -9,6 +9,9 @@ #include "../tensor.hpp" #include "../utils.hpp" #include "infinicore_infer.h" +#include "infiniop/ops/conv.h" +#include "infiniop/ops/gelu.h" +#include "infiniop/ops/layer_norm.h" class IDescriptorDestroyer { public: @@ -154,25 +157,37 @@ class CacheManager { public: DECLARE_OP_CACHE(Add) DECLARE_OP_CACHE(RMSNorm) + DECLARE_OP_CACHE(LayerNorm) DECLARE_OP_CACHE(Gemm) DECLARE_OP_CACHE(RoPE) + DECLARE_OP_CACHE(MRoPE2D) + DECLARE_OP_CACHE(MRoPE3D) DECLARE_OP_CACHE(Rearrange) DECLARE_OP_CACHE(CausalSoftmax) + DECLARE_OP_CACHE(Softmax) DECLARE_OP_CACHE(Topkrouter) DECLARE_OP_CACHE(SwiGLU) DECLARE_OP_CACHE(RandomSample) DECLARE_OP_CACHE(DequantizeAWQ) + DECLARE_OP_CACHE(Conv) + DECLARE_OP_CACHE(Gelu) CacheManager(size_t capacity = 100) : Add_cache(capacity, DESTROY_FUNC(Add)), RMSNorm_cache(capacity, DESTROY_FUNC(RMSNorm)), + LayerNorm_cache(capacity, DESTROY_FUNC(LayerNorm)), Gemm_cache(capacity, DESTROY_FUNC(Gemm)), RoPE_cache(capacity, DESTROY_FUNC(RoPE)), + MRoPE2D_cache(capacity, DESTROY_FUNC(MRoPE2D)), + MRoPE3D_cache(capacity, DESTROY_FUNC(MRoPE3D)), Rearrange_cache(capacity, DESTROY_FUNC(Rearrange)), CausalSoftmax_cache(capacity, DESTROY_FUNC(CausalSoftmax)), + Softmax_cache(capacity, DESTROY_FUNC(Softmax)), Topkrouter_cache(capacity, DESTROY_FUNC(Topkrouter)), SwiGLU_cache(capacity, DESTROY_FUNC(SwiGLU)), RandomSample_cache(capacity, DESTROY_FUNC(RandomSample)), + Conv_cache(capacity, DESTROY_FUNC(Conv)), + Gelu_cache(capacity, DESTROY_FUNC(Gelu)), DequantizeAWQ_cache(capacity, DESTROY_FUNC(DequantizeAWQ)) {} template diff --git a/src/models/inference_context.cpp b/src/models/inference_context.cpp index db5fda11..18562cff 100644 --- a/src/models/inference_context.cpp +++ b/src/models/inference_context.cpp @@ -1,6 +1,10 @@ #include "inference_context.hpp" #include "../tensor.hpp" #include "../utils.hpp" 
+#include "infiniop/ops/conv.h" +#include "infiniop/ops/gelu.h" +#include "infiniop/ops/layer_norm.h" +#include "infiniop/ops/softmax.h" InferenceContext::InferenceContext(infiniopHandle_t op_handle_, std::shared_ptr memory_pool_, CacheManager *cache_manager, infinirtStream_t stream) : op_handle(op_handle_), memory_pool(memory_pool_), cache_manager(cache_manager), stream(stream) {} @@ -56,6 +60,34 @@ void InferenceContext::rmsnorm(std::shared_ptr y, y->data(), x->data(), w->data(), stream)); } +void InferenceContext::layernorm(std::shared_ptr y, + std::shared_ptr input_standardization, + std::shared_ptr input_std_deviation, + std::shared_ptr x, + std::shared_ptr w, + std::shared_ptr b, + float epsilon) { + size_t key = CacheManager::createDescriptorKey(y, input_standardization, input_std_deviation, x, w, b); + + infiniopLayerNormDescriptor_t desc; + if (!cache_manager->getLayerNormDescriptor(key, desc)) { + RUN_INFINI(infiniopCreateLayerNormDescriptor( + op_handle, &desc, y->desc(), input_standardization->desc(), input_std_deviation->desc(), + x->desc(), w->desc(), b ? b->desc() : nullptr, epsilon)); + cache_manager->putLayerNormDescriptor(key, desc); + } + + size_t workspace_size = 0; + RUN_INFINI(infiniopGetLayerNormWorkspaceSize(desc, &workspace_size)); + ensure_workspace(workspace_size); + void *workspace = workspace_storage->memory(); + + RUN_INFINI(infiniopLayerNorm( + desc, workspace, workspace_size, + y->data(), input_standardization->data(), input_std_deviation->data(), + x->data(), w->data(), b ? b->data() : nullptr, stream)); +} + void InferenceContext::gemm(std::shared_ptr c, std::shared_ptr a, std::shared_ptr b, @@ -123,6 +155,80 @@ void InferenceContext::rope(std::shared_ptr q, sin->data(), cos->data(), stream)); } +void InferenceContext::mrope_2d(std::shared_ptr q, + std::shared_ptr k, + std::shared_ptr pos, + std::shared_ptr sin, + std::shared_ptr cos) { + size_t key = CacheManager::createDescriptorKey(q, k, pos, sin, cos); + + infiniopMRoPE2DDescriptor_t desc; + if (!cache_manager->getMRoPE2DDescriptor(key, desc)) { + RUN_INFINI(infiniopCreateMRoPE2DDescriptor( + op_handle, &desc, q->desc(), k->desc(), + pos->desc(), sin->desc(), cos->desc())); + cache_manager->putMRoPE2DDescriptor(key, desc); + } + + size_t workspace_size = 0; + RUN_INFINI(infiniopGetMRoPE2DWorkspaceSize(desc, &workspace_size)); + ensure_workspace(workspace_size); + void *workspace = workspace_storage->memory(); + + RUN_INFINI(infiniopMRoPE2D( + desc, workspace, workspace_size, + q->data(), k->data(), pos->data(), + sin->data(), cos->data(), stream)); +} + +void InferenceContext::mrope_3d(std::shared_ptr q, + std::shared_ptr k, + std::shared_ptr pos, + std::shared_ptr sin, + std::shared_ptr cos, + std::shared_ptr rope_section) { + size_t key = CacheManager::createDescriptorKey(q, k, pos, sin, cos, rope_section); + + infiniopMRoPE3DDescriptor_t desc; + if (!cache_manager->getMRoPE3DDescriptor(key, desc)) { + RUN_INFINI(infiniopCreateMRoPE3DDescriptor( + op_handle, &desc, q->desc(), k->desc(), + pos->desc(), sin->desc(), cos->desc(), + rope_section->desc())); + cache_manager->putMRoPE3DDescriptor(key, desc); + } + + size_t workspace_size = 0; + RUN_INFINI(infiniopGetMRoPE3DWorkspaceSize(desc, &workspace_size)); + ensure_workspace(workspace_size); + void *workspace = workspace_storage->memory(); + + RUN_INFINI(infiniopMRoPE3D( + desc, workspace, workspace_size, + q->data(), k->data(), pos->data(), + sin->data(), cos->data(), rope_section->data(), stream)); +} + +void InferenceContext::softmax(std::shared_ptr 
y, + std::shared_ptr x) { + size_t key = CacheManager::createDescriptorKey(y, x); + + infiniopSoftmaxDescriptor_t desc; + if (!cache_manager->getSoftmaxDescriptor(key, desc)) { + RUN_INFINI(infiniopCreateSoftmaxDescriptor( + op_handle, &desc, y->desc(), x->desc())); + cache_manager->putSoftmaxDescriptor(key, desc); + } + + size_t workspace_size = 0; + RUN_INFINI(infiniopGetSoftmaxWorkspaceSize(desc, &workspace_size)); + ensure_workspace(workspace_size); + void *workspace = workspace_storage->memory(); + + RUN_INFINI(infiniopSoftmax(desc, workspace, workspace_size, + y->data(), x->data(), stream)); +} + void InferenceContext::causalSoftmax(std::shared_ptr y, std::shared_ptr x) { size_t key = CacheManager::createDescriptorKey(y, x); @@ -281,3 +387,53 @@ void InferenceContext::dequant(std::shared_ptr weight, desc, workspace, workspace_size, weight->data(), in_w->data(), in_s->data(), in_z->data(), stream)); } + +void InferenceContext::conv3d(std::shared_ptr output, + std::shared_ptr input, + std::shared_ptr weight, + std::shared_ptr bias, + const std::vector &pads, + const std::vector &strides, + const std::vector &dilations) { + size_t key = CacheManager::createDescriptorKey(output, input, weight, bias); + + infiniopConvDescriptor_t desc; + if (!cache_manager->getConvDescriptor(key, desc)) { + RUN_INFINI(infiniopCreateConvDescriptor( + op_handle, &desc, + output->desc(), input->desc(), weight->desc(), bias->desc(), + const_cast(pads.data()), + const_cast(strides.data()), + const_cast(dilations.data()), + pads.size())); + cache_manager->putConvDescriptor(key, desc); + } + + size_t workspace_size = 0; + RUN_INFINI(infiniopGetConvWorkspaceSize(desc, &workspace_size)); + ensure_workspace(workspace_size); + void *workspace = workspace_storage->memory(); + + RUN_INFINI(infiniopConv( + desc, workspace, workspace_size, + output->data(), input->data(), weight->data(), + bias ? 
bias->data() : nullptr, stream)); +} + +void InferenceContext::gelu(std::shared_ptr output, + std::shared_ptr input) { + size_t key = CacheManager::createDescriptorKey(output, input); + + infiniopGeluDescriptor_t desc; + if (!cache_manager->getGeluDescriptor(key, desc)) { + RUN_INFINI(infiniopCreateGeluDescriptor(op_handle, &desc, output->desc(), input->desc())); + cache_manager->putGeluDescriptor(key, desc); + } + + size_t workspace_size = 0; + RUN_INFINI(infiniopGetGeluWorkspaceSize(desc, &workspace_size)); + ensure_workspace(workspace_size); + void *workspace = workspace_storage->memory(); + + RUN_INFINI(infiniopGelu(desc, workspace, workspace_size, output->data(), input->data(), stream)); +} diff --git a/src/models/inference_context.hpp b/src/models/inference_context.hpp index 0cf93f6f..bd2e3d5e 100644 --- a/src/models/inference_context.hpp +++ b/src/models/inference_context.hpp @@ -23,6 +23,13 @@ struct InferenceContext { std::shared_ptr x, std::shared_ptr w, float epsilon); + void layernorm(std::shared_ptr y, + std::shared_ptr input_standardization, + std::shared_ptr input_std_deviation, + std::shared_ptr x, + std::shared_ptr w, + std::shared_ptr b, + float epsilon); void gemm(std::shared_ptr c, std::shared_ptr a, std::shared_ptr b, @@ -35,8 +42,21 @@ struct InferenceContext { std::shared_ptr sin, std::shared_ptr cos, infiniopRoPEAlgo_t algo); + void mrope_2d(std::shared_ptr q, + std::shared_ptr k, + std::shared_ptr pos, + std::shared_ptr sin, + std::shared_ptr cos); + void mrope_3d(std::shared_ptr q, + std::shared_ptr k, + std::shared_ptr pos, + std::shared_ptr sin, + std::shared_ptr cos, + std::shared_ptr rope_section); void causalSoftmax(std::shared_ptr y, std::shared_ptr x); + void softmax(std::shared_ptr y, + std::shared_ptr x); void topkrouter(std::shared_ptr values, // F32 std::shared_ptr indices, // I32 @@ -52,6 +72,16 @@ struct InferenceContext { std::shared_ptr prob, float random_val, float top_p, uint32_t top_k, float temperature); + void conv3d(std::shared_ptr output, + std::shared_ptr input, + std::shared_ptr weight, + std::shared_ptr bias, + const std::vector &pads, + const std::vector &strides, + const std::vector &dilations); + + void gelu(std::shared_ptr output, + std::shared_ptr input); void linear(std::shared_ptr c, std::shared_ptr a, std::shared_ptr b, @@ -86,6 +116,12 @@ inline void rmsnorm(std::shared_ptr y, std::shared_ptr x, getInferenceContext().rmsnorm(y, x, w, epsilon); } +inline void layernorm(std::shared_ptr y, std::shared_ptr input_standardization, + std::shared_ptr input_std_deviation, std::shared_ptr x, + std::shared_ptr w, std::shared_ptr b, float epsilon) { + getInferenceContext().layernorm(y, input_standardization, input_std_deviation, x, w, b, epsilon); +} + inline void gemm(std::shared_ptr c, std::shared_ptr a, std::shared_ptr b, float alpha, float beta) { getInferenceContext().gemm(c, a, b, alpha, beta); @@ -107,10 +143,26 @@ inline void rope_v2(std::shared_ptr q, std::shared_ptr k, getInferenceContext().rope(q, k, pos, sin, cos, INFINIOP_ROPE_ALGO_GPT_NEOX); } +inline void mrope_2d(std::shared_ptr q, std::shared_ptr k, + std::shared_ptr pos, std::shared_ptr sin, + std::shared_ptr cos) { + getInferenceContext().mrope_2d(q, k, pos, sin, cos); +} + +inline void mrope_3d(std::shared_ptr q, std::shared_ptr k, + std::shared_ptr pos, std::shared_ptr sin, + std::shared_ptr cos, std::shared_ptr rope_section) { + getInferenceContext().mrope_3d(q, k, pos, sin, cos, rope_section); +} + inline void causalSoftmax(std::shared_ptr y, std::shared_ptr x) { 
getInferenceContext().causalSoftmax(y, x); } +inline void softmax(std::shared_ptr y, std::shared_ptr x) { + getInferenceContext().softmax(y, x); +} + inline void topkrouter(std::shared_ptr values, // F32 std::shared_ptr indices, // I32 std::shared_ptr x, @@ -136,6 +188,16 @@ inline void randomSample(std::shared_ptr out, std::shared_ptr pr getInferenceContext().randomSample(out, prob, random_val, top_p, top_k, temperature); } +inline void conv3d(std::shared_ptr output, + std::shared_ptr input, + std::shared_ptr weight, + std::shared_ptr bias, + const std::vector &pads, + const std::vector &strides, + const std::vector &dilations) { + getInferenceContext().conv3d(output, input, weight, bias, pads, strides, dilations); +} + inline void linear(std::shared_ptr c, std::shared_ptr a, std::shared_ptr b, float alpha, float beta, std::shared_ptr residual, std::shared_ptr bias) { diff --git a/src/models/qwen3_vl/qwen3_vl.cpp b/src/models/qwen3_vl/qwen3_vl.cpp new file mode 100644 index 00000000..95e9c101 --- /dev/null +++ b/src/models/qwen3_vl/qwen3_vl.cpp @@ -0,0 +1,1093 @@ +#include "qwen3_vl.hpp" + +#include "../../tensor.hpp" +#include "../../utils.hpp" +#include "../inference_context.hpp" + +#include +#include +#include +#include + +// 条件编译调试宏 +#ifdef DEBUG_VISION +#define DEBUG_PRINT(fmt, ...) printf("[DEBUG] " fmt "\n", ##__VA_ARGS__) +#else +#define DEBUG_PRINT(fmt, ...) \ + do { \ + } while (0) +#endif + +// 常量定义 +namespace Qwen3VLConstants { +constexpr uint32_t SPATIAL_MERGE_SIZE = 2; +constexpr uint32_t MERGE_UNIT = SPATIAL_MERGE_SIZE * SPATIAL_MERGE_SIZE; // 4 +constexpr uint32_t IN_CHANNELS = 3; +constexpr uint32_t TEMPORAL_PATCH_SIZE = 2; +constexpr uint32_t PATCH_SIZE = 16; +constexpr uint32_t VISION_MLP_EXPANSION = 4; // vision_hidden_size * 4 +constexpr uint32_t MAX_DEEPSTACK_LAYERS = 3; +constexpr uint32_t ROPE_SECTION_SIZE = 3; +constexpr uint32_t POS_IDS_2D_SIZE = 2; +constexpr uint32_t LLM_POS_IDS_3D_SIZE = 3; +// constexpr float EPSILON_DEFAULT = 1e-6f; // 暂时未使用,保留备用 +} // namespace Qwen3VLConstants + +inline void createDeviceResource(DeviceResource *rsrc, const Qwen3VLMeta *meta, + std::shared_ptr weights, + infiniDevice_t device, int idev, + int ndev, int dev_id, + infinicclComm_t comm) { + RUN_INFINI(infinirtSetDevice(device, dev_id)); + infiniopHandle_t handle; + infiniopCreateHandle(&handle); + infinirtStream_t stream; + infinirtStreamCreate(&stream); + + auto memory_pool = std::make_shared(128 * 1024 * 1024); + + *rsrc = DeviceResource{ + device, + dev_id, + handle, + weights, + stream, + comm, + memory_pool, + }; + RUN_INFINI(infinirtDeviceSynchronize()); +} + +inline void releaseDeviceResource(DeviceResource &res) { + infinirtDeviceSynchronize(); + // Release individual Tensors + + infiniopDestroyHandle(res.handle); + res.handle = nullptr; + infinirtStreamDestroy(res.stream); + res.stream = nullptr; + infinicclCommDestroy(res.comm); + res.comm = nullptr; +} + +std::tuple, uint32_t> inferVision(const Qwen3VLMeta *meta, DeviceResource &rsrc, + const float *pixel_values, uint32_t num_patches, + const uint32_t *pos_ids, uint32_t pos_ids_len, + const uint32_t *llm_pos_ids, uint32_t llm_pos_ids_len, + const uint32_t *rope_section, uint32_t rope_section_len, int ndev) { + auto d = meta->d; + auto dt_logits = meta->dt_logits; + auto stream = rsrc.stream; + auto weight = rsrc.weights; + + // 若有视觉输入,先跑ViT得到visual_embeds;随后构建logits_in(视觉token用visual_embeds替换) + std::shared_ptr vision_pos_ids_buf; // for ViT [patches, 2] + std::shared_ptr llm_pos_ids_buf; // for LLM 
[patches+text_len, 3] + std::shared_ptr rope_section_buf; // rope_section [3,] + if (pos_ids != nullptr && pos_ids_len > 0) { + assert(pos_ids_len % Qwen3VLConstants::POS_IDS_2D_SIZE == 0 && "pos_ids_len must be even for 2D mRoPE [patches, 2] format"); + assert(num_patches > 0 && "num_patches cannot be zero for 2D mRoPE"); + + vision_pos_ids_buf = (rsrc.device == INFINI_DEVICE_CPU) + ? Tensor::weight(const_cast(pos_ids), INFINI_DTYPE_U32, {num_patches, 2}) + : Tensor::buffer(INFINI_DTYPE_U32, {num_patches, 2}, rsrc.memory_pool); + if (rsrc.device != INFINI_DEVICE_CPU) { + RUN_INFINI(infinirtMemcpyAsync(vision_pos_ids_buf->data(), pos_ids, sizeof(uint32_t) * pos_ids_len, + INFINIRT_MEMCPY_H2D, stream)); + } + } + + // LLM 3D mRoPE参数处理:验证并构建llm_pos_ids和rope_section缓冲区 + if (llm_pos_ids != nullptr && llm_pos_ids_len > 0) { + assert(llm_pos_ids_len % Qwen3VLConstants::LLM_POS_IDS_3D_SIZE == 0 && "llm_pos_ids_len must be divisible by 3 for 3D mRoPE [patches+text_len, 3] format"); + uint32_t total_tokens = llm_pos_ids_len / Qwen3VLConstants::LLM_POS_IDS_3D_SIZE; + llm_pos_ids_buf = (rsrc.device == INFINI_DEVICE_CPU) + ? Tensor::weight(const_cast(llm_pos_ids), INFINI_DTYPE_U32, {total_tokens, 3}) + : Tensor::buffer(INFINI_DTYPE_U32, {total_tokens, 3}, rsrc.memory_pool); + if (rsrc.device != INFINI_DEVICE_CPU) { + RUN_INFINI(infinirtMemcpyAsync(llm_pos_ids_buf->data(), llm_pos_ids, sizeof(uint32_t) * llm_pos_ids_len, + INFINIRT_MEMCPY_H2D, stream)); + } + } + + if (rope_section != nullptr && rope_section_len > 0) { + assert(rope_section_len == Qwen3VLConstants::ROPE_SECTION_SIZE && "rope_section_len must be exactly 3 for [t, h, w] format"); + rope_section_buf = (rsrc.device == INFINI_DEVICE_CPU) + ? Tensor::weight(const_cast(rope_section), INFINI_DTYPE_U32, {3}) + : Tensor::buffer(INFINI_DTYPE_U32, {Qwen3VLConstants::ROPE_SECTION_SIZE}, rsrc.memory_pool); + if (rsrc.device != INFINI_DEVICE_CPU) { + RUN_INFINI(infinirtMemcpyAsync(rope_section_buf->data(), rope_section, sizeof(uint32_t) * Qwen3VLConstants::ROPE_SECTION_SIZE, + INFINIRT_MEMCPY_H2D, stream)); + } + } + + std::shared_ptr visual_embeds; // [num_patches, vision_hidden_size] + // Deepstack特征提取层索引 + // todo: 从config读取,默认[3,6,9] + std::vector deepstack_layers = {3, 6, 9}; + std::vector> deepstack_features; + + DEBUG_PRINT("Vision processing: num_patches=%u", num_patches); + // ===================1.patch_embd=================== + // todo py端读入进meta里 + // 根据权重形状确定实际参数: [vision_hidden_size, 3, temporal, patch, patch] + uint32_t in_channels = Qwen3VLConstants::IN_CHANNELS; + uint32_t temporal_patch_size = Qwen3VLConstants::TEMPORAL_PATCH_SIZE; + uint32_t patch_size = Qwen3VLConstants::PATCH_SIZE; + uint32_t vision_hidden_size = static_cast(meta->vision_hidden_size); + uint32_t patch_feature_dim = in_channels * temporal_patch_size * patch_size * patch_size; + + // 检查原始输入数据 + DEBUG_PRINT("=== 原始输入数据检查 ==="); + DEBUG_PRINT("num_patches=%u, in_channels=%u, temporal_patch_size=%u, patch_size=%u", + num_patches, in_channels, temporal_patch_size, patch_size); + DEBUG_PRINT("vision_hidden_size=%u, patch_feature_dim=%u", vision_hidden_size, patch_feature_dim); + + // // 按 num_patches 行, patch_feature_dim 列打印 pixel_values + // DEBUG_PRINT("pixel_values 按 [num_patches, patch_feature_dim] 打印:"); + // for (uint32_t i = 0; i < num_patches; ++i) { + // std::string row_str = "pixel_values[" + std::to_string(i) + "]:"; + // for (uint32_t j = 0; j < patch_feature_dim; ++j) { + // uint32_t idx = i * patch_feature_dim + j; + // char buf[64]; + // snprintf(buf, 
sizeof(buf), " %f", pixel_values[idx]); + // row_str += buf; + // } + // printf("%s\n", row_str.c_str()); // 添加这行来实际打印 + // } + + // 输入像素: [num_patches, 3, 2, 16, 16] + std::shared_ptr pixel_values_buf; // 外部传入的 pixel_values 是 float + if (rsrc.device == INFINI_DEVICE_CPU) { + pixel_values_buf = Tensor::weight(const_cast(pixel_values), INFINI_DTYPE_F32, + {num_patches, in_channels, temporal_patch_size, patch_size, patch_size}); + } else { + pixel_values_buf = Tensor::buffer(INFINI_DTYPE_F32, {num_patches, in_channels, temporal_patch_size, patch_size, patch_size}, rsrc.memory_pool); + RUN_INFINI(infinirtMemcpyAsync(pixel_values_buf->data(), pixel_values, + sizeof(float) * num_patches * patch_feature_dim, + INFINIRT_MEMCPY_H2D, stream)); + } + // conv buffer & config + auto conv_output = Tensor::buffer(dt_logits, {num_patches, vision_hidden_size, 1, 1, 1}, rsrc.memory_pool); + std::vector pads = {0, 0, 0}; + std::vector strides = {int64_t(temporal_patch_size), int64_t(patch_size), int64_t(patch_size)}; // strides = kernel_size + std::vector dilations = {1, 1, 1}; + // 检查 conv3d 的输入数据 + // DEBUG_PRINT("=== conv3d 输入数据检查 ==="); + // DEBUG_PRINT("pixel_values_buf 信息:"); + // pixel_values_buf->debug(); + + // patch_embed 权重 + // DEBUG_PRINT("weight->w_v_patch_embed_proj[0] 信息:"); + // weight->w_v_patch_embed_proj[0]->debug(); + // DEBUG_PRINT("weight->b_v_patch_embed_proj[0] 信息:"); + // weight->b_v_patch_embed_proj[0]->debug(); + + // pos_embed 权重 + // DEBUG_PRINT("weight->w_v_pos_embed[0] 信息:"); + // weight->w_v_pos_embed[0]->debug(); + + // merger 权重 + // DEBUG_PRINT("weight->w_v_merger_ln_q[0] 信息:"); + // weight->w_v_merger_ln_q[0]->debug(); + // DEBUG_PRINT("weight->b_v_merger_ln_q[0] 信息:"); + // weight->b_v_merger_ln_q[0]->debug(); + // DEBUG_PRINT("weight->w_v_merger_mlp_0[0] 信息:"); + // weight->w_v_merger_mlp_0[0]->debug(); + // DEBUG_PRINT("weight->b_v_merger_mlp_0[0] 信息:"); + // weight->b_v_merger_mlp_0[0]->debug(); + // DEBUG_PRINT("weight->w_v_merger_mlp_2[0] 信息:"); + // weight->w_v_merger_mlp_2[0]->debug(); + // DEBUG_PRINT("weight->b_v_merger_mlp_2[0] 信息:"); + // weight->b_v_merger_mlp_2[0]->debug(); + + // // merger_list 权重 (只打印第一个) + // DEBUG_PRINT("weight->w_v_merger_list_0_ln_q[0] 信息:"); + // weight->w_v_merger_list_0_ln_q[0]->debug(); + // DEBUG_PRINT("weight->b_v_merger_list_0_ln_q[0] 信息:"); + // weight->b_v_merger_list_0_ln_q[0]->debug(); + // DEBUG_PRINT("weight->w_v_merger_list_0_mlp_0[0] 信息:"); + // weight->w_v_merger_list_0_mlp_0[0]->debug(); + // DEBUG_PRINT("weight->b_v_merger_list_0_mlp_0[0] 信息:"); + // weight->b_v_merger_list_0_mlp_0[0]->debug(); + // DEBUG_PRINT("weight->w_v_merger_list_0_mlp_2[0] 信息:"); + // weight->w_v_merger_list_0_mlp_2[0]->debug(); + // DEBUG_PRINT("weight->b_v_merger_list_0_mlp_2[0] 信息:"); + // weight->b_v_merger_list_0_mlp_2[0]->debug(); + + // // block0 权重和偏置 + // DEBUG_PRINT("weight->w_v_norm1[0] 信息:"); + // weight->w_v_norm1[0]->debug(); + // DEBUG_PRINT("weight->b_v_norm1[0] 信息:"); + // weight->b_v_norm1[0]->debug(); + // DEBUG_PRINT("weight->w_v_attn_proj[0] 信息:"); + // weight->w_v_attn_proj[0]->debug(); + // DEBUG_PRINT("weight->b_v_attn_proj[0] 信息:"); + // weight->b_v_attn_proj[0]->debug(); + DEBUG_PRINT("weight->w_v_attn_qkv[0] 信息:"); + weight->w_v_attn_qkv[0]->debug(); + DEBUG_PRINT("weight->b_v_attn_qkv[0] 信息:"); + weight->b_v_attn_qkv[0]->debug(); + // DEBUG_PRINT("weight->w_v_norm2[0] 信息:"); + // weight->w_v_norm2[0]->debug(); + // DEBUG_PRINT("weight->b_v_norm2[0] 信息:"); + // weight->b_v_norm2[0]->debug(); + // 
DEBUG_PRINT("weight->w_v_mlp_fc1[0] 信息:"); + // weight->w_v_mlp_fc1[0]->debug(); + // DEBUG_PRINT("weight->b_v_mlp_fc1[0] 信息:"); + // weight->b_v_mlp_fc1[0]->debug(); + // DEBUG_PRINT("weight->w_v_mlp_fc2[0] 信息:"); + // weight->w_v_mlp_fc2[0]->debug(); + // DEBUG_PRINT("weight->b_v_mlp_fc2[0] 信息:"); + // weight->b_v_mlp_fc2[0]->debug(); + + DEBUG_PRINT("conv3d 参数: pads=[%ld,%ld,%ld], strides=[%ld,%ld,%ld], dilations=[%ld,%ld,%ld]", + pads[0], pads[1], pads[2], strides[0], strides[1], strides[2], + dilations[0], dilations[1], dilations[2]); + + DEBUG_PRINT("conv_output 形状: [%zu,%zu,%zu,%zu,%zu]", + conv_output->shape()[0], conv_output->shape()[1], conv_output->shape()[2], + conv_output->shape()[3], conv_output->shape()[4]); + + // // patch_embd + // conv3d(conv_output, + // pixel_values_buf, + // weight->w_v_patch_embed_proj[0], + // weight->b_v_patch_embed_proj[0], + // pads, strides, dilations); + + // // 打印 conv3d 的结果 + // DEBUG_PRINT("=== conv3d 输出结果 ==="); + // conv_output->debug(); + + exit(0); + + auto vit_hidden = conv_output->view({num_patches, vision_hidden_size}); + + // ===================2.abs_pos_embd=================== + if (weight->w_v_pos_embed.size() > 0 && weight->w_v_pos_embed[0]) { + // todo: 实现fast_pos_embed_interpolate的完整版本 + // 对于单图推理,我们使用线性插值来调整位置编码到当前图像尺寸 + auto pos_embed_weight = weight->w_v_pos_embed[0]; // [num_pos, vision_hidden_size] + auto pos_embed_out = Tensor::buffer(dt_logits, {num_patches, vision_hidden_size}, rsrc.memory_pool); + + // 简化版本:直接取前num_patches个位置编码(假设位置编码表足够大) + uint32_t available_pos = std::min(num_patches, static_cast(pos_embed_weight->shape()[0])); + if (available_pos > 0) { + RUN_INFINI(infinirtMemcpyAsync( + pos_embed_out->data(), + pos_embed_weight->data(), + dsize(dt_logits) * available_pos * vision_hidden_size, + INFINIRT_MEMCPY_D2D, stream)); + + // 如果num_patches > available_pos,用零填充剩余部分 + if (num_patches > available_pos) { + auto zero_tensor = Tensor::buffer(dt_logits, {(num_patches - available_pos), vision_hidden_size}, rsrc.memory_pool); + // 将零张量数据复制到位置编码输出的剩余部分 + RUN_INFINI(infinirtMemcpyAsync( + pos_embed_out->data(available_pos * vision_hidden_size), + zero_tensor->data(), + dsize(dt_logits) * (num_patches - available_pos) * vision_hidden_size, + INFINIRT_MEMCPY_D2D, stream)); + } + + // 添加位置编码到patch embeddings: vit_hidden = vit_hidden + pos_embeds + add(vit_hidden, vit_hidden, pos_embed_out); + } + } + + // ===================3.vit_blocks=================== + uint32_t vision_layers = static_cast(meta->vision_layers); + uint32_t vision_heads = static_cast(meta->vision_heads); + uint32_t dh_v = vision_hidden_size / vision_heads; + assert(dh_v * vision_heads == vision_hidden_size); + + DEBUG_PRINT("ViT configuration: layers=%u, heads=%u, hidden_size=%u, dh_v=%u", + vision_layers, vision_heads, vision_hidden_size, dh_v); + + // 缓冲区 + auto vit_hidden_in = Tensor::buffer(dt_logits, {num_patches, vision_hidden_size}, rsrc.memory_pool); + auto vit_hidden_out = Tensor::buffer(dt_logits, {num_patches, vision_hidden_size}, rsrc.memory_pool); + vit_hidden_in->copyFrom(vit_hidden, rsrc.handle, stream); + + auto vit_qkv = Tensor::buffer(dt_logits, {num_patches, 3u * vision_hidden_size}, rsrc.memory_pool); + auto vit_q = Tensor::buffer(dt_logits, {num_patches, vision_hidden_size}, rsrc.memory_pool); + auto vit_k = Tensor::buffer(dt_logits, {num_patches, vision_hidden_size}, rsrc.memory_pool); + auto vit_v = Tensor::buffer(dt_logits, {num_patches, vision_hidden_size}, rsrc.memory_pool); + + auto qk_v_buf = Tensor::buffer(dt_logits, 
{vision_heads, num_patches, num_patches}, rsrc.memory_pool); + auto attn_val_v = Tensor::buffer(dt_logits, {vision_heads, num_patches, dh_v}, rsrc.memory_pool); + + // ===================3.1 attention(2d mRoPE)=================== + for (uint32_t vlayer = 0; vlayer < vision_layers; ++vlayer) { + DEBUG_PRINT("ViT processing layer %u/%u", vlayer + 1, vision_layers); + // ViT norm1: 在 [num_patches, 1, vision_hidden_size] 上做 LayerNorm + { + auto norm1_in_3d = vit_hidden_in->view({num_patches, 1u, vision_hidden_size}); + auto norm1_out_3d = Tensor::buffer(dt_logits, {num_patches, 1u, vision_hidden_size}, rsrc.memory_pool); + auto norm1_input_standardization_3d = Tensor::buffer(dt_logits, {num_patches, 1u, vision_hidden_size}, rsrc.memory_pool); + auto norm1_input_std_deviation_2d = Tensor::buffer(dt_logits, {num_patches, 1u}, rsrc.memory_pool); + layernorm(norm1_out_3d, + norm1_input_standardization_3d, + norm1_input_std_deviation_2d, + norm1_in_3d, + weight->w_v_norm1[vlayer], + weight->b_v_norm1[vlayer], + meta->epsilon); + rearrange(vit_hidden_out, norm1_out_3d->view({num_patches, vision_hidden_size})); + } + // QKV + linear(vit_qkv, vit_hidden_out, weight->w_v_attn_qkv[vlayer], 1.0, 0.0, nullptr, weight->b_v_attn_qkv[vlayer]); + // split q,k,v + rearrange(vit_q, vit_qkv->slice(1, 0, vision_hidden_size)); + rearrange(vit_k, vit_qkv->slice(1, vision_hidden_size, vision_hidden_size)); + rearrange(vit_v, vit_qkv->slice(1, 2u * vision_hidden_size, vision_hidden_size)); + + // 2D mRoPE on q,k: 使用 ViT 专用 (h,w) 位置编码 + assert(vision_pos_ids_buf != nullptr && "vision_pos_ids_buf cannot be nullptr"); + auto q_view = vit_q->view({vision_heads, num_patches, dh_v}); + auto k_view = vit_k->view({vision_heads, num_patches, dh_v}); + mrope_2d(q_view, q_view, vision_pos_ids_buf, weight->sin_table_v, weight->cos_table_v); + mrope_2d(k_view, k_view, vision_pos_ids_buf, weight->sin_table_v, weight->cos_table_v); + + // Self-Attention: QK^T -> softmax -> *V + { + auto q_view = vit_q->view({vision_heads, num_patches, dh_v}); + auto k_view = vit_k->view({vision_heads, dh_v, num_patches}); + auto qk_view = qk_v_buf->view({vision_heads, num_patches, num_patches}); + linear(qk_view, q_view, k_view, 1.f / float(sqrt(dh_v)), 0.f, nullptr, nullptr); + // ViT 使用非因果 softmax (无 mask) + softmax(qk_view, qk_view); + auto v_view = vit_v->view({vision_heads, num_patches, dh_v}); + linear(attn_val_v, qk_view, v_view, 1.f, 0.f, nullptr, nullptr); + } + + // 合并 heads:[heads, num_patches, dh_v] -> [num_patches, heads*dh_v] + auto attn_rearranged = Tensor::buffer(dt_logits, {num_patches, vision_hidden_size}, rsrc.memory_pool); + auto attn_perm = attn_val_v->permute({1, 0, 2}); + // 变连续 + auto attn_contig = Tensor::buffer(dt_logits, {num_patches, vision_heads, dh_v}, rsrc.memory_pool); + rearrange(attn_contig, attn_perm); + // view + auto attn_view = attn_contig->view({num_patches, vision_hidden_size}); + rearrange(attn_rearranged, attn_view); + + // out proj + 残差 + linear(vit_hidden_in, attn_rearranged, weight->w_v_attn_proj[vlayer], 1.0, 0.0, vit_hidden_in, weight->b_v_attn_proj[vlayer]); + + // ===================3.2 ffn=================== + // FFN 的 norm2: 在 [num_patches, 1, vision_hidden_size] 上做 LayerNorm + { + auto norm2_in_3d = vit_hidden_in->view({num_patches, 1u, vision_hidden_size}); + auto norm2_out_3d = Tensor::buffer(dt_logits, {num_patches, 1u, vision_hidden_size}, rsrc.memory_pool); + auto norm2_input_standardization_3d = Tensor::buffer(dt_logits, {num_patches, 1u, vision_hidden_size}, rsrc.memory_pool); + auto 
norm2_input_std_deviation_2d = Tensor::buffer(dt_logits, {num_patches, 1u}, rsrc.memory_pool); + layernorm(norm2_out_3d, + norm2_input_standardization_3d, + norm2_input_std_deviation_2d, + norm2_in_3d, + weight->w_v_norm2[vlayer], + weight->b_v_norm2[vlayer], + meta->epsilon); + rearrange(vit_hidden_out, norm2_out_3d->view({num_patches, vision_hidden_size})); + } + auto vit_fc1 = Tensor::buffer(dt_logits, {num_patches, Qwen3VLConstants::VISION_MLP_EXPANSION * vision_hidden_size}, rsrc.memory_pool); + linear(vit_fc1, vit_hidden_out, weight->w_v_mlp_fc1[vlayer], 1.0, 0.0, nullptr, weight->b_v_mlp_fc1[vlayer]); + auto vit_gelu = Tensor::buffer(dt_logits, {num_patches, Qwen3VLConstants::VISION_MLP_EXPANSION * vision_hidden_size}, rsrc.memory_pool); + DEBUG_PRINT("ViT layer %u: applying GELU activation (should be gelu_pytorch_tanh)", vlayer); + getInferenceContext().gelu(vit_gelu, vit_fc1); + auto vit_fc2 = Tensor::buffer(dt_logits, {num_patches, vision_hidden_size}, rsrc.memory_pool); + linear(vit_fc2, vit_gelu, weight->w_v_mlp_fc2[vlayer], 1.0, 0.0, nullptr, weight->b_v_mlp_fc2[vlayer]); + // 残差 + rearrange(vit_hidden_in, vit_fc2->view({num_patches, vision_hidden_size})); + + // ===================3.3 deepstack_merger=================== + // 在指定层提取deepstack特征 + if (std::find(deepstack_layers.begin(), deepstack_layers.end(), vlayer) != deepstack_layers.end()) { + size_t deepstack_idx = std::find(deepstack_layers.begin(), deepstack_layers.end(), vlayer) - deepstack_layers.begin(); + DEBUG_PRINT("Deepstack feature extraction at layer %u (deepstack_idx=%zu)", vlayer, deepstack_idx); + + if (deepstack_idx < Qwen3VLConstants::MAX_DEEPSTACK_LAYERS) { // 最多3个deepstack层 + const auto &w_ln_q = deepstack_idx == 0 ? weight->w_v_merger_list_0_ln_q + : deepstack_idx == 1 ? weight->w_v_merger_list_1_ln_q + : weight->w_v_merger_list_2_ln_q; + const auto &b_ln_q = deepstack_idx == 0 ? weight->b_v_merger_list_0_ln_q + : deepstack_idx == 1 ? weight->b_v_merger_list_1_ln_q + : weight->b_v_merger_list_2_ln_q; + const auto &w_mlp_0 = deepstack_idx == 0 ? weight->w_v_merger_list_0_mlp_0 + : deepstack_idx == 1 ? weight->w_v_merger_list_1_mlp_0 + : weight->w_v_merger_list_2_mlp_0; + const auto &b_mlp_0 = deepstack_idx == 0 ? weight->b_v_merger_list_0_mlp_0 + : deepstack_idx == 1 ? weight->b_v_merger_list_1_mlp_0 + : weight->b_v_merger_list_2_mlp_0; + const auto &w_mlp_2 = deepstack_idx == 0 ? weight->w_v_merger_list_0_mlp_2 + : deepstack_idx == 1 ? weight->w_v_merger_list_1_mlp_2 + : weight->w_v_merger_list_2_mlp_2; + const auto &b_mlp_2 = deepstack_idx == 0 ? weight->b_v_merger_list_0_mlp_2 + : deepstack_idx == 1 ? 
weight->b_v_merger_list_1_mlp_2 + : weight->b_v_merger_list_2_mlp_2; + + // Deepstack merger:view->norm->MLP + // use_postshuffle_norm=true: ln_q(x.view(-1, self.hidden_size)) + const uint32_t merge_unit = Qwen3VLConstants::MERGE_UNIT; + uint32_t num_groups = num_patches / merge_unit; + uint32_t hidden_size_merged = vision_hidden_size * merge_unit; + assert(num_patches >= merge_unit && weight->w_v_merger_mlp_0.size() > 0); + + // view:四合一 + auto ds_input = vit_hidden_in->view({num_groups, hidden_size_merged}); + + // LayerNorm:以 [batch, channel, feature] 形式调用, batch=num_groups, channel=1, feature=hidden_size_merged + auto ds_input_3d = ds_input->view({num_groups, 1u, hidden_size_merged}); + auto ds_norm_3d = Tensor::buffer(dt_logits, {num_groups, 1u, hidden_size_merged}, rsrc.memory_pool); + auto ds_input_standardization_3d = Tensor::buffer(dt_logits, {num_groups, 1u, hidden_size_merged}, rsrc.memory_pool); + auto ds_input_std_deviation_2d = Tensor::buffer(dt_logits, {num_groups, 1u}, rsrc.memory_pool); + layernorm(ds_norm_3d, ds_input_standardization_3d, ds_input_std_deviation_2d, ds_input_3d, w_ln_q[0], b_ln_q[0], meta->epsilon); + auto ds_norm = ds_norm_3d->view({num_groups, hidden_size_merged}); + + // MLP: fc1 -> GELU -> fc2 + auto ds_fc1 = Tensor::buffer(dt_logits, {num_groups, hidden_size_merged}, rsrc.memory_pool); + linear(ds_fc1, ds_norm, w_mlp_0[0]->permute({1, 0}), 1.0, 0.0, nullptr, b_mlp_0[0]); + + auto ds_gelu = Tensor::buffer(dt_logits, {num_groups, hidden_size_merged}, rsrc.memory_pool); + DEBUG_PRINT("Deepstack merger %zu: applying GELU activation", deepstack_idx); + getInferenceContext().gelu(ds_gelu, ds_fc1); + + auto ds_out = Tensor::buffer(dt_logits, {num_groups, d}, rsrc.memory_pool); + linear(ds_out, ds_gelu, w_mlp_2[0]->permute({1, 0}), 1.0, 0.0, nullptr, b_mlp_2[0]); + deepstack_features.push_back(ds_out); + } + } + } + // vit 输出 + visual_embeds = vit_hidden_in; + + // ===================4.merger=================== + // 主 Merger: norm->view->MLP + // use_postshuffle_norm=false: ln_q(x).view(-1, self.hidden_size) + DEBUG_PRINT("Starting Vision Merger processing: deepstack_features.size()=%zu", deepstack_features.size()); + const uint32_t merge_unit = Qwen3VLConstants::MERGE_UNIT; + uint32_t num_groups = num_patches / merge_unit; + uint32_t hidden_size_merged = vision_hidden_size * merge_unit; + assert(num_patches >= merge_unit && weight->w_v_merger_mlp_0.size() > 0); + + // LayerNorm:以 [batch, channel, feature] 形式调用, batch=num_patches, channel=1, feature=vision_hidden_size + auto merger_ln_in_3d = visual_embeds->view({num_patches, 1u, vision_hidden_size}); + auto merger_ln_out_3d = Tensor::buffer(dt_logits, {num_patches, 1u, vision_hidden_size}, rsrc.memory_pool); + auto merger_ln_standardization_3d = Tensor::buffer(dt_logits, {num_patches, 1u, vision_hidden_size}, rsrc.memory_pool); + auto merger_ln_stddev_2d = Tensor::buffer(dt_logits, {num_patches, 1u}, rsrc.memory_pool); + layernorm(merger_ln_out_3d, + merger_ln_standardization_3d, + merger_ln_stddev_2d, + merger_ln_in_3d, + weight->w_v_merger_ln_q[0], + weight->b_v_merger_ln_q[0], + meta->epsilon); + + // view:四合一 + auto merger_in = merger_ln_out_3d->view({num_groups, hidden_size_merged}); + + // MLP: fc1 -> GELU -> fc2 + auto merger_fc1 = Tensor::buffer(dt_logits, {num_groups, hidden_size_merged}, rsrc.memory_pool); + linear(merger_fc1, merger_in, weight->w_v_merger_mlp_0[0]->permute({1, 0}), 1.0, 0.0, nullptr, weight->b_v_merger_mlp_0[0]); + + auto merger_gelu = Tensor::buffer(dt_logits, {num_groups, 
hidden_size_merged}, rsrc.memory_pool);
+    DEBUG_PRINT("Main merger: applying GELU activation");
+    getInferenceContext().gelu(merger_gelu, merger_fc1);
+
+    auto merger_out = Tensor::buffer(dt_logits, {num_groups, d}, rsrc.memory_pool);
+    linear(merger_out, merger_gelu, weight->w_v_merger_mlp_2[0]->permute({1, 0}), 1.0, 0.0, nullptr, weight->b_v_merger_mlp_2[0]);
+
+    // ===================4.1 merger concat===================
+    // Concatenate the main merger output with the deepstack features:
+    // [main_features] + deepstack_features (concatenated along the feature dimension)
+    assert(!deepstack_features.empty());
+    uint32_t total_dim = d * (1 + deepstack_features.size());
+    auto concat_embeds = Tensor::buffer(dt_logits, {num_groups, total_dim}, rsrc.memory_pool);
+
+    // Copy the main features
+    RUN_INFINI(infinirtMemcpyAsync(
+        concat_embeds->data(),
+        merger_out->data(),
+        dsize(dt_logits) * num_groups * d, INFINIRT_MEMCPY_D2D, stream));
+
+    // Copy the deepstack features (in extraction order)
+    for (size_t i = 0; i < deepstack_features.size(); ++i) {
+        RUN_INFINI(infinirtMemcpyAsync(
+            concat_embeds->data((i + 1) * num_groups * d),
+            deepstack_features[i]->data(),
+            dsize(dt_logits) * num_groups * d, INFINIRT_MEMCPY_D2D, stream));
+    }
+
+    visual_embeds = concat_embeds;
+
+    // After the 4-to-1 spatial merge, num_patches == num_groups
+    num_patches = num_groups;
+
+    return std::make_tuple(visual_embeds, num_groups);
+}
+
+void inferDeviceBatch(const Qwen3VLMeta *meta, DeviceResource &rsrc,
+                      uint32_t idev, uint32_t ndev,
+                      const uint32_t *tokens, uint32_t ntok,
+                      const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
+                      const uint32_t *pos_ids, uint32_t pos_ids_len,
+                      const uint32_t *llm_pos_ids, uint32_t llm_pos_ids_len,
+                      const uint32_t *rope_section, uint32_t rope_section_len,
+                      const float *pixel_values, uint32_t /*is_vision_mode*/, // vision data pointer; the explicit vision-mode flag is no longer needed
+                      struct KVCache **kv_caches,
+                      const float *temperature, const uint32_t *topk, const float *topp,
+                      uint32_t *output, void *last_logits) {
+    // DEBUG: start of inference
+    // printf("[DEBUG] Qwen3VL inferDeviceBatch START: idev=%u, ntok=%u, nreq=%u, has_vision=%s\n",
+    //        idev, ntok, nreq, (pixel_values != nullptr) ? "true" : "false");
+    auto nlayer = meta->nlayer;
+    auto nkvh = meta->nkvh / ndev;
+    auto nh = meta->nh / ndev;
+    auto ngroup = nh / nkvh;
+    // auto dctx = meta.dctx;
+    auto dh = meta->dh;
+    auto d = meta->d;
+    auto dt_logits = meta->dt_logits;
+    auto di = meta->di / ndev;
+    auto dvoc = meta->dvoc;
+    auto stream = rsrc.stream;
+    auto weight = rsrc.weights;
+    bool has_qkv_bias = meta->has_qkv_bias;
+
+    // Determine whether this is the prefill phase (vision input present and first forward pass)
+    bool has_vision = (pixel_values != nullptr);
+    bool is_prefill = has_vision && (req_pos[0] == 0);
+
+    // Compute the actual number of patches
+    uint32_t num_patches = 0;
+    if (pos_ids != nullptr && pos_ids_len > 0) {
+        num_patches = pos_ids_len / Qwen3VLConstants::POS_IDS_2D_SIZE;
+    }
+
+    uint32_t llm_ntok = is_prefill ? (ntok - 1 + num_patches) : ntok; // e.g. prefill: 14 tokens - 1 image token + 600 patches = 613
+    // printf("[DEBUG] is_prefill=%s, ntok=%u, llm_ntok=%u, num_patches=%u\n",
+    //        is_prefill ? "true" : "false", ntok, llm_ntok, num_patches);
+    DEBUG_PRINT("is_prefill=%s, ntok=%u, llm_ntok=%u, num_patches=%u",
+                is_prefill ? 
"true" : "false", ntok, llm_ntok, num_patches); + + std::shared_ptr vision_pos_ids_buf; // for ViT [patches, 2] + std::shared_ptr llm_pos_ids_buf; // for LLM [patches+text_len, 3] - TODO: wire from API + std::shared_ptr rope_section_buf; // rope_section [3,] - TODO: wire from API + if (pos_ids != nullptr && pos_ids_len > 0) { + assert(pos_ids_len % Qwen3VLConstants::POS_IDS_2D_SIZE == 0 && "pos_ids_len must be even for 2D mRoPE [patches, 2] format"); + assert(num_patches > 0 && "num_patches cannot be zero for 2D mRoPE"); + + vision_pos_ids_buf = (rsrc.device == INFINI_DEVICE_CPU) + ? Tensor::weight(const_cast(pos_ids), INFINI_DTYPE_U32, {num_patches, 2}) + : Tensor::buffer(INFINI_DTYPE_U32, {num_patches, 2}, rsrc.memory_pool); + if (rsrc.device != INFINI_DEVICE_CPU) { + RUN_INFINI(infinirtMemcpyAsync(vision_pos_ids_buf->data(), pos_ids, sizeof(uint32_t) * pos_ids_len, + INFINIRT_MEMCPY_H2D, stream)); + } + } + + // LLM 3D mRoPE参数处理:验证并构建llm_pos_ids和rope_section缓冲区 + if (llm_pos_ids != nullptr && llm_pos_ids_len > 0) { + assert(llm_pos_ids_len % Qwen3VLConstants::LLM_POS_IDS_3D_SIZE == 0 && "llm_pos_ids_len must be divisible by 3 for 3D mRoPE [patches+text_len, 3] format"); + uint32_t total_tokens = llm_pos_ids_len / Qwen3VLConstants::LLM_POS_IDS_3D_SIZE; + llm_pos_ids_buf = (rsrc.device == INFINI_DEVICE_CPU) + ? Tensor::weight(const_cast(llm_pos_ids), INFINI_DTYPE_U32, {total_tokens, 3}) + : Tensor::buffer(INFINI_DTYPE_U32, {total_tokens, 3}, rsrc.memory_pool); + if (rsrc.device != INFINI_DEVICE_CPU) { + RUN_INFINI(infinirtMemcpyAsync(llm_pos_ids_buf->data(), llm_pos_ids, sizeof(uint32_t) * llm_pos_ids_len, + INFINIRT_MEMCPY_H2D, stream)); + } + } + + if (rope_section != nullptr && rope_section_len > 0) { + assert(rope_section_len == Qwen3VLConstants::ROPE_SECTION_SIZE && "rope_section_len must be exactly 3 for [t, h, w] format"); + rope_section_buf = (rsrc.device == INFINI_DEVICE_CPU) + ? 
Tensor::weight(const_cast(rope_section), INFINI_DTYPE_U32, {3}) + : Tensor::buffer(INFINI_DTYPE_U32, {Qwen3VLConstants::ROPE_SECTION_SIZE}, rsrc.memory_pool); + if (rsrc.device != INFINI_DEVICE_CPU) { + RUN_INFINI(infinirtMemcpyAsync(rope_section_buf->data(), rope_section, sizeof(uint32_t) * Qwen3VLConstants::ROPE_SECTION_SIZE, + INFINIRT_MEMCPY_H2D, stream)); + } + } + + // Allocate buffers + auto logits_in = Tensor::buffer(dt_logits, {llm_ntok, d}, rsrc.memory_pool); + auto logits_out = Tensor::buffer(dt_logits, {llm_ntok, d}, rsrc.memory_pool); + auto q_buf = Tensor::buffer(dt_logits, {llm_ntok, nh * dh}, rsrc.memory_pool); + auto k_buf = Tensor::buffer(dt_logits, {llm_ntok, nkvh * dh}, rsrc.memory_pool); + auto v_buf = Tensor::buffer(dt_logits, {llm_ntok, nkvh * dh}, rsrc.memory_pool); + + auto gate_buf = Tensor::buffer(dt_logits, {llm_ntok, di}, rsrc.memory_pool); + auto up_buf = Tensor::buffer(dt_logits, {llm_ntok, di}, rsrc.memory_pool); + + auto o_buf = Tensor::buffer(dt_logits, {llm_ntok, nh * dh}, rsrc.memory_pool); + auto prob_buf = Tensor::buffer(dt_logits, {nreq, dvoc}, rsrc.memory_pool); + auto result_buf = Tensor::buffer(INFINI_DTYPE_I64, {nreq}, rsrc.memory_pool); + auto result_cpu = std::vector(nreq); + + // Prepare inputs + auto batch_pos_ids = std::vector(ntok); + size_t req_start = 0; + for (uint32_t req = 0; req < nreq; req++) { + for (uint32_t i = 0; i < req_lens[req]; i++) { + batch_pos_ids[req_start + i] = req_pos[req] + i; + } + req_start += req_lens[req]; + } + + // vision infer + std::shared_ptr visual_embeds; + if (has_vision && is_prefill) { + auto [embeds, output_patches] = inferVision(meta, rsrc, pixel_values, num_patches, pos_ids, pos_ids_len, llm_pos_ids, llm_pos_ids_len, rope_section, rope_section_len, ndev); + visual_embeds = embeds; + num_patches = output_patches; + } else { + visual_embeds = nullptr; + num_patches = 0; + } + + // img_embd 和 text_embd 拼接构建 logits_in + if (is_prefill) { + // Prefill阶段:文本token查表,视觉token用 visual_embeds 顺序展开 + size_t vis_idx = 0; + uint32_t out_idx = 0; + for (uint32_t i = 0; i < ntok; i++) { + const bool is_image_tok = (meta->image_token_id != 0 && tokens[i] == meta->image_token_id); + const bool is_video_tok = (meta->video_token_id != 0 && tokens[i] == meta->video_token_id); + if (has_vision && (is_image_tok || is_video_tok) && visual_embeds) { + // 将一个vision token展开为num_patches个patch + for (size_t patch_idx = 0; patch_idx < num_patches && vis_idx < num_patches; patch_idx++, vis_idx++, out_idx++) { + uint32_t copy_dim = std::min(d, static_cast(visual_embeds->shape()[1])); + if (copy_dim > 0) { + RUN_INFINI(infinirtMemcpyAsync( + logits_in->data(out_idx * d), + visual_embeds->data(vis_idx * visual_embeds->shape()[1]), + dsize(dt_logits) * copy_dim, INFINIRT_MEMCPY_D2D, stream)); + } + } + } else { + RUN_INFINI(infinirtMemcpyAsync( + logits_in->data(out_idx * d), + weight->w_in_embd->data(tokens[i] * d), + dsize(dt_logits) * d, INFINIRT_MEMCPY_D2D, stream)); + out_idx++; + } + } + } else { + // Decode阶段:直接使用text token查表 + for (uint32_t i = 0; i < ntok; i++) { + RUN_INFINI(infinirtMemcpyAsync( + logits_in->data(i * d), + weight->w_in_embd->data(tokens[i] * d), + dsize(dt_logits) * d, INFINIRT_MEMCPY_D2D, stream)); + } + } + + // Attention + // attention inner + size_t max_qk_size = 0; + size_t max_seq_len = 0; + + for (uint32_t req = 0; req < nreq; req++) { + auto past_len = req_pos[req]; + auto seq_len = req_lens[req]; + auto total_len = past_len + seq_len; + + max_qk_size = std::max(max_qk_size, size_t(seq_len * 
total_len)); + max_seq_len = std::max(max_seq_len, size_t(seq_len)); + } + + auto qk_buf = Tensor::buffer(dt_logits, {nh * max_qk_size}, rsrc.memory_pool); + auto rearrange_q_buf = Tensor::buffer(dt_logits, {nkvh, ngroup * max_seq_len, dh}, rsrc.memory_pool); + auto q_rearrange = rearrange_q_buf->view({nkvh, ngroup, max_seq_len, dh}); + auto attn_val_buf = Tensor::buffer(dt_logits, {nkvh, ngroup * max_seq_len, dh}, rsrc.memory_pool); + auto attn_val_gemm = attn_val_buf->view({nkvh, ngroup, max_seq_len, dh}); + + // Compute llm + DEBUG_PRINT("Starting LLM processing: %zu layers", (size_t)nlayer); + for (uint32_t layer = 0; layer < nlayer; layer++) { + DEBUG_PRINT("LLM processing layer %u/%zu", layer + 1, (size_t)nlayer); + // 1. Attention + // rms norm + rmsnorm(logits_out, logits_in, weight->w_attn_norm[layer], meta->epsilon); + // qkv_proj + linear(q_buf, logits_out, + weight->w_attn_q[layer], + 1.0, 0.0, nullptr, has_qkv_bias ? weight->b_attn_q[layer] : nullptr); + linear(k_buf, logits_out, + weight->w_attn_k[layer], + 1.0, 0.0, nullptr, has_qkv_bias ? weight->b_attn_k[layer] : nullptr); + // q/k-norm + if (weight->w_q_norm.size() > layer && weight->w_k_norm.size() > layer) { + auto q_norm_buf = Tensor::buffer(dt_logits, {llm_ntok, nh * dh}, rsrc.memory_pool); + auto k_norm_buf = Tensor::buffer(dt_logits, {llm_ntok, nkvh * dh}, rsrc.memory_pool); + rmsnorm(q_norm_buf, q_buf, weight->w_q_norm[layer], meta->epsilon); + rmsnorm(k_norm_buf, k_buf, weight->w_k_norm[layer], meta->epsilon); + rearrange(q_buf, q_norm_buf); + rearrange(k_buf, k_norm_buf); + } + linear(v_buf, logits_out, + weight->w_attn_v[layer], + 1.0, 0.0, nullptr, has_qkv_bias ? weight->b_attn_v[layer] : nullptr); + // RoPE处理:prefill阶段使用3D MRoPE,decode阶段使用普通RoPE + if (is_prefill && llm_pos_ids_buf && rope_section_buf) { + // Prefill阶段:3D MRoPE + // printf("[DEBUG] MRoPE3D参数检查:\n"); + auto q_view = q_buf->view({nh, llm_ntok, dh}); + auto k_view = k_buf->view({nkvh, llm_ntok, dh}); + // printf("[DEBUG] q维度: [%zu, %zu, %zu] (nhead=%zu, llm_seqlen=%u, dhead=%zu)\n", + // q_view->shape()[0], q_view->shape()[1], q_view->shape()[2], nh, llm_ntok, dh); + // printf("[DEBUG] k维度: [%zu, %zu, %zu] (nkvh=%zu, llm_seqlen=%u, dhead=%zu)\n", + // k_view->shape()[0], k_view->shape()[1], k_view->shape()[2], nkvh, llm_ntok, dh); + // printf("[DEBUG] pos维度: [%zu, %zu]\n", llm_pos_ids_buf->shape()[0], llm_pos_ids_buf->shape()[1]); + // printf("[DEBUG] sin维度: [%zu, %zu]\n", weight->sin_table->shape()[0], weight->sin_table->shape()[1]); + // printf("[DEBUG] cos维度: [%zu, %zu]\n", weight->cos_table->shape()[0], weight->cos_table->shape()[1]); + // printf("[DEBUG] rope_section维度: [%zu]\n", rope_section_buf->shape()[0]); + + mrope_3d(q_view, q_view, llm_pos_ids_buf, weight->sin_table, weight->cos_table, rope_section_buf); + mrope_3d(k_view, k_view, llm_pos_ids_buf, weight->sin_table, weight->cos_table, rope_section_buf); + } else if (!is_prefill) { + // Decode阶段:普通RoPE,使用当前位置 + // printf("[DEBUG] 使用普通RoPE for decode阶段\n"); + auto pos_buf = Tensor::buffer(INFINI_DTYPE_U32, {llm_ntok}, rsrc.memory_pool); + // decode时位置来自req_pos + current_step + uint32_t current_pos = req_pos[0] + req_lens[0] - 1; // 当前生成位置 + uint32_t pos_data = current_pos; + RUN_INFINI(infinirtMemcpyAsync(pos_buf->data(), &pos_data, sizeof(uint32_t), INFINIRT_MEMCPY_H2D, stream)); + + // 添加decode阶段RoPE调试信息 + // printf("[DEBUG] Decode RoPE 参数调试:\n"); + // printf("[DEBUG] req_pos[0]=%u, req_lens[0]=%u, current_pos=%u\n", req_pos[0], req_lens[0], current_pos); + // printf("[DEBUG] nh=%zu, 
nkvh=%zu, llm_ntok=%u, dh=%zu\n", nh, nkvh, llm_ntok, dh); + + auto q_view = q_buf->view({llm_ntok, nh, dh}); + auto k_view = k_buf->view({llm_ntok, nkvh, dh}); + // printf("[DEBUG] q_view维度: [%zu, %zu, %zu]\n", q_view->shape()[0], q_view->shape()[1], q_view->shape()[2]); + // printf("[DEBUG] k_view维度: [%zu, %zu, %zu]\n", k_view->shape()[0], k_view->shape()[1], k_view->shape()[2]); + // printf("[DEBUG] pos_buf维度: [%zu]\n", pos_buf->shape()[0]); + // printf("[DEBUG] sin_table维度: [%zu, %zu]\n", weight->sin_table->shape()[0], weight->sin_table->shape()[1]); + // printf("[DEBUG] cos_table维度: [%zu, %zu]\n", weight->cos_table->shape()[0], weight->cos_table->shape()[1]); + + // // RoPE维度一致性检查 + // printf("[DEBUG] RoPE维度检查:\n"); + // printf("[DEBUG] seqlen(llm_ntok)=%u, nhead(nh)=%zu, dhead(dh)=%zu\n", llm_ntok, nh, dh); + // printf("[DEBUG] table_len(sin[0])=%zu, table_dim(sin[1])=%zu\n", weight->sin_table->shape()[0], weight->sin_table->shape()[1]); + // printf("[DEBUG] pos_seqlen=%zu\n", pos_buf->shape()[0]); + + rope(q_view, q_view, pos_buf, weight->sin_table, weight->cos_table); + rope(k_view, k_view, pos_buf, weight->sin_table, weight->cos_table); + } + + size_t token_offset = 0; + for (uint32_t req = 0; req < nreq; req++) { + auto past_len = req_pos[req]; + auto seq_len = req_lens[req]; + auto total_len = past_len + seq_len; + // printf("[DEBUG] KV Cache - req=%u: past_len=%u, seq_len=%u, total_len=%u\n", req, past_len, seq_len, total_len); + auto o = o_buf->slice({{0, token_offset, seq_len}})->view({seq_len, nkvh, ngroup, dh})->permute({1, 2, 0, 3}); + auto q = q_buf->slice({{0, token_offset, seq_len}})->view({seq_len, nkvh, ngroup, dh})->permute({1, 2, 0, 3}); + auto k = k_buf->slice({{0, token_offset, seq_len}})->view({seq_len, nkvh, dh}); + auto v = v_buf->slice({{0, token_offset, seq_len}})->view({seq_len, nkvh, dh}); + + // self attention + // concat + rearrange(kv_caches[req]->k[idev][layer]->slice(0, past_len, seq_len), k); + rearrange(kv_caches[req]->v[idev][layer]->slice(0, past_len, seq_len), v); + // qk + rearrange(q_rearrange->slice(2, 0, seq_len), q); + auto qk_gemm = qk_buf->slice(0, 0, nh * seq_len * total_len)->view({nkvh, ngroup * seq_len, total_len}); + auto k_gemm = kv_caches[req]->k[idev][layer]->slice(0, 0, total_len)->permute({1, 2, 0}); + linear(qk_gemm, rearrange_q_buf->slice(1, 0, ngroup * seq_len), k_gemm, 1.f / float(sqrt(dh)), 0.f, nullptr, nullptr); + // softmax + auto qk_softmax = qk_gemm->view({nh, seq_len, total_len}); + causalSoftmax(qk_softmax, qk_softmax); + auto v_gemm = kv_caches[req]->v[idev][layer]->slice(0, 0, total_len)->permute({1, 0, 2}); + linear(attn_val_buf->slice(1, 0, ngroup * seq_len), qk_gemm, v_gemm, 1.f, 0.f, nullptr, nullptr); + // rearrange attn val + rearrange(o, attn_val_gemm->slice(2, 0, seq_len)); + + token_offset += seq_len; + } + + // o_proj + linear(logits_in, o_buf, weight->w_attn_out[layer], + 1.0, 0.0, idev == 0 ? logits_in : nullptr, nullptr); // only rank 0 adds residual + // All_reduce if distributed + if (rsrc.comm != nullptr) { + RUN_INFINI(infinicclAllReduce( + logits_in->data(), logits_in->data(), llm_ntok * d, dt_logits, + INFINICCL_SUM, rsrc.comm, stream)); + RUN_INFINI(infinirtStreamSynchronize(stream)); + } + + // 2. 
FFN + rmsnorm(logits_out, logits_in, weight->w_ffn_norm[layer], meta->epsilon); + linear(gate_buf, logits_out, + weight->w_ffn_gate[layer], + 1.0, 0.0, nullptr, nullptr); + linear(up_buf, logits_out, + weight->w_ffn_up[layer], + 1.0, 0.0, nullptr, nullptr); + DEBUG_PRINT("LLM layer %u: applying SwiGLU activation (SiLU-based)", layer); + swiglu(gate_buf, up_buf, gate_buf); + linear(logits_in, gate_buf, + weight->w_ffn_down[layer], + 1.0, 0.0, idev == 0 ? logits_in : nullptr, nullptr); // only rank 0 adds residual + // All_reduce if distributed + if (rsrc.comm != nullptr) { + RUN_INFINI(infinicclAllReduce( + logits_in->data(), logits_in->data(), llm_ntok * d, dt_logits, + INFINICCL_SUM, rsrc.comm, stream)); + RUN_INFINI(infinirtStreamSynchronize(stream)); + } + } + + // Sample and Output + if (idev == 0) { + if (last_logits != nullptr) { + rmsnorm(logits_out, logits_in, weight->w_out_norm, meta->epsilon); + auto last_logits_buf = Tensor::buffer(dt_logits, {llm_ntok, dvoc}, rsrc.memory_pool); + linear(last_logits_buf, logits_out, weight->w_out_embd, 1.0, 0.0, nullptr, nullptr); + RUN_INFINI(infinirtStreamSynchronize(stream)); + RUN_INFINI(infinirtMemcpy(last_logits, last_logits_buf->data(), dsize(dt_logits) * llm_ntok * dvoc, INFINIRT_MEMCPY_D2H)); + } + if (output != nullptr) { + size_t token_offset = 0; + for (uint32_t req = 0; req < nreq; req++) { + auto seq_len = req_lens[req]; + token_offset += seq_len; + rmsnorm(logits_out->slice(0, req, 1), + logits_in->slice(0, token_offset - 1, 1), + weight->w_out_norm, + meta->epsilon); + } + linear(prob_buf, logits_out->slice(0, 0, nreq), weight->w_out_embd, 1.0, 0.0, nullptr, nullptr); + + // [DEBUG]:在采样前检查prob_buf是否存在NaN/Inf,并统计范围 + RUN_INFINI(infinirtStreamSynchronize(stream)); + auto prob_cpu = std::vector(nreq * dvoc); + RUN_INFINI(infinirtMemcpy(prob_cpu.data(), prob_buf->data(), sizeof(float) * nreq * dvoc, INFINIRT_MEMCPY_D2H)); + size_t nan_count = 0, inf_count = 0; + float global_min = std::numeric_limits::infinity(); + float global_max = -std::numeric_limits::infinity(); + for (size_t i = 0; i < (size_t)nreq * (size_t)dvoc; ++i) { + float v = prob_cpu[i]; + if (!std::isfinite(v)) { + if (std::isnan(v)) { + nan_count++; + } else { + inf_count++; + } + } else { + if (v < global_min) { + global_min = v; + } + if (v > global_max) { + global_max = v; + } + } + } + DEBUG_PRINT("prob_buf stats: nan=%zu, inf=%zu, min=%g, max=%g", nan_count, inf_count, global_min, global_max); + (void)nan_count; + (void)inf_count; // suppress unused variable warnings + + std::random_device _rd; + std::mt19937 gen(_rd()); + token_offset = 0; + for (uint32_t req = 0; req < nreq; req++) { + auto seq_len = req_lens[req]; + float random_val = std::uniform_real_distribution(0, 1)(gen); + randomSample(result_buf->slice(0, req, 1)->view_as({}, {}), + prob_buf->slice(0, req, 1)->view_as({dvoc}, {1}), + random_val, topp[req], topk[req], temperature[req]); + token_offset += seq_len; + } + RUN_INFINI(infinirtStreamSynchronize(stream)); + RUN_INFINI(infinirtMemcpy(result_cpu.data(), result_buf->data(), + sizeof(int64_t) * nreq, INFINIRT_MEMCPY_D2H)); + for (uint32_t req = 0; req < nreq; req++) { + output[req] = uint32_t(result_cpu[req]); + } + } + } + + DEBUG_PRINT("Qwen3VL inferDeviceBatch COMPLETED successfully"); +} + +__C void +inferBatchQwen3VL(struct Qwen3VLModel *model, + const uint32_t *tokens, uint32_t ntok, + const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos, + const uint32_t *pos_ids, uint32_t pos_ids_len, + const uint32_t *llm_pos_ids, uint32_t 
llm_pos_ids_len, + const uint32_t *rope_section, uint32_t rope_section_len, + const float *pixel_values, + struct KVCache **kv_caches, + const float *temperature, const uint32_t *topk, const float *topp, + uint32_t *output) { + model->req.tokens = tokens; + model->req.ntok = ntok; + model->req.req_lens = req_lens; + model->req.nreq = nreq; + model->req.req_pos = req_pos; + model->req.pos_ids = pos_ids; + model->req.pos_ids_len = pos_ids_len; + model->req.llm_pos_ids = llm_pos_ids; + model->req.llm_pos_ids_len = llm_pos_ids_len; + model->req.rope_section = rope_section; + model->req.rope_section_len = rope_section_len; + model->req.pixel_values = pixel_values; + model->req.kv_caches = kv_caches; + model->req.output = output; + model->req.logits = nullptr; + model->req.temperature = temperature; + model->req.topk = topk; + model->req.topp = topp; + + for (size_t idev = 0; idev < model->dev_ids.size(); idev++) { + std::unique_lock lock(model->states[idev].mtx); + model->states[idev].proceed = true; + lock.unlock(); + model->states[idev].cv_start.notify_one(); + } + for (size_t i = model->dev_ids.size(); i > 0; i--) { + auto idev = i - 1; + std::unique_lock lock(model->states[idev].mtx); + model->states[idev].cv_done.wait(lock, [&] { return !(model->states[idev].proceed); }); + lock.unlock(); + } +} + +__C void +forwardBatchQwen3VL(struct Qwen3VLModel *model, + const uint32_t *tokens, uint32_t ntok, + const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos, + const uint32_t *pos_ids, uint32_t pos_ids_len, + const uint32_t *llm_pos_ids, uint32_t llm_pos_ids_len, + const uint32_t *rope_section, uint32_t rope_section_len, + const float *pixel_values, + struct KVCache **kv_caches, + void *logits) { + model->req.tokens = tokens; + model->req.ntok = ntok; + model->req.req_lens = req_lens; + model->req.nreq = nreq; + model->req.req_pos = req_pos; + model->req.pos_ids = pos_ids; + model->req.pos_ids_len = pos_ids_len; + model->req.llm_pos_ids = llm_pos_ids; + model->req.llm_pos_ids_len = llm_pos_ids_len; + model->req.rope_section = rope_section; + model->req.rope_section_len = rope_section_len; + model->req.pixel_values = pixel_values; + model->req.kv_caches = kv_caches; + model->req.output = nullptr; + model->req.logits = logits; + model->req.temperature = nullptr; + model->req.topk = nullptr; + model->req.topp = nullptr; + + for (size_t idev = 0; idev < model->dev_ids.size(); idev++) { + std::unique_lock lock(model->states[idev].mtx); + model->states[idev].proceed = true; + lock.unlock(); + model->states[idev].cv_start.notify_one(); + } + for (size_t i = model->dev_ids.size(); i > 0; i--) { + auto idev = i - 1; + std::unique_lock lock(model->states[idev].mtx); + model->states[idev].cv_done.wait(lock, [&] { return !(model->states[idev].proceed); }); + lock.unlock(); + } +} + +void launchDevice(const Qwen3VLMeta *meta, std::shared_ptr weights, DeviceResource *rsrc, InferState &state, InferRequest &req, + infiniDevice_t device, int idev, int ndev, int dev_id, infinicclComm_t comm) { + // Create Device Resource + createDeviceResource(rsrc, meta, weights, device, idev, ndev, dev_id, comm); + + CacheManager cache_manager(100); + InferenceContext ctx(rsrc->handle, rsrc->memory_pool, &cache_manager, rsrc->stream); + + // Set the inference context for this thread + setInferenceContext(&ctx); + + { + std::unique_lock lock(state.mtx); + state.loaded = true; + lock.unlock(); + state.cv_load.notify_one(); + } + + // Infer Loop + while (true) { + std::unique_lock lock(state.mtx); + 
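+        // Worker-side of the host/worker handshake (the host side lives in
+        // inferBatchQwen3VL / forwardBatchQwen3VL above):
+        //   host:   fills model->req, sets proceed = true, notifies cv_start;
+        //   worker: wakes here, runs one batch, sets proceed = false, notifies cv_done;
+        //   host:   waits on cv_done until proceed is false again, then reads the outputs.
+        // Block until the host publishes a request, or until exit_flag is set during shutdown.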
state.cv_start.wait(lock, [&] { return state.proceed || state.exit_flag; }); + // quit if exit_flag is set + if (state.exit_flag) { + break; + } + + inferDeviceBatch(meta, *rsrc, idev, ndev, req.tokens, req.ntok, + req.req_lens, req.nreq, req.req_pos, req.pos_ids, req.pos_ids_len, + req.llm_pos_ids, req.llm_pos_ids_len, req.rope_section, req.rope_section_len, + req.pixel_values, 0, + req.kv_caches, req.temperature, req.topk, req.topp, req.output, req.logits); + + state.proceed = false; + lock.unlock(); + state.cv_done.notify_one(); + } + + // Clean-Up + releaseDeviceResource(*rsrc); + setInferenceContext(nullptr); // Clear the context when done +} + +Qwen3VLModel::Qwen3VLModel(const Qwen3VLMeta *meta, const ModelWeights *weights_) { + auto weights = (Qwen3VLWeights *)(weights_); + device = weights->device(); + dev_ids = weights->devIds(); + int ndev = int(dev_ids.size()); + dev_resources = std::vector(ndev); + states = std::vector(ndev); + threads.resize(ndev); + + auto comms = std::vector(ndev, nullptr); + if (ndev > 1) { + RUN_INFINI(infinicclCommInitAll(device, comms.data(), ndev, dev_ids.data())); + } + + for (int i = 0; i < ndev; i++) { + threads[i] = std::thread(launchDevice, meta, weights->device_weights()[i], &dev_resources[i], std::ref(states[i]), std::ref(req), device, i, ndev, dev_ids[i], comms[i]); + } + for (int i = 0; i < ndev; i++) { + std::unique_lock lock(states[i].mtx); + states[i].cv_load.wait(lock, [&] { return states[i].loaded; }); + lock.unlock(); + } +} + +__C struct Qwen3VLModel * +createQwen3VLModel(const Qwen3VLMeta *meta, + const ModelWeights *weights) { + Qwen3VLModel *model = new Qwen3VLModel(meta, weights); + return model; +} + +__C void destroyQwen3VLModel(struct Qwen3VLModel *model) { + auto ndev = model->dev_resources.size(); + + for (size_t idev = 0; idev < ndev; idev++) { + std::unique_lock lock(model->states[idev].mtx); + model->states[idev].exit_flag = true; + lock.unlock(); + model->states[idev].cv_start.notify_one(); + } + + for (size_t idev = 0; idev < ndev; idev++) { + model->threads[idev].join(); + } + + delete model; +} diff --git a/src/models/qwen3_vl/qwen3_vl.hpp b/src/models/qwen3_vl/qwen3_vl.hpp new file mode 100644 index 00000000..75947fb8 --- /dev/null +++ b/src/models/qwen3_vl/qwen3_vl.hpp @@ -0,0 +1,110 @@ +#pragma once +#include "infinicore_infer/models/qwen3_vl.h" + +#include "../../cache.hpp" +#include "../../dataloader/weights_loader.hpp" + +#include +#include +#include + +struct Qwen3VLDeviceWeight { + std::shared_ptr w_in_embd, w_out_norm, w_out_embd, sin_table, cos_table; + std::vector> w_attn_norm, b_attn_q, b_attn_k, b_attn_v, w_ffn_norm; + std::vector> w_attn_q, w_attn_k, w_attn_v, w_attn_out, w_ffn_gate, w_ffn_up, w_ffn_down; + std::vector> w_q_norm, w_k_norm; + + // Vision encoder weights + std::shared_ptr sin_table_v, cos_table_v; // ViT 专用 2D mRoPE 表 + std::vector> b_v_attn_proj, w_v_attn_proj; + std::vector> b_v_attn_qkv, w_v_attn_qkv; + std::vector> b_v_mlp_fc1, w_v_mlp_fc1; + std::vector> b_v_mlp_fc2, w_v_mlp_fc2; + std::vector> b_v_norm1, w_v_norm1; + std::vector> b_v_norm2, w_v_norm2; + std::vector> b_v_merger_ln_q, w_v_merger_ln_q; + std::vector> b_v_merger_mlp_0, w_v_merger_mlp_0; + std::vector> b_v_merger_mlp_2, w_v_merger_mlp_2; + std::vector> b_v_merger_list_0_ln_q, w_v_merger_list_0_ln_q; + std::vector> b_v_merger_list_0_mlp_0, w_v_merger_list_0_mlp_0; + std::vector> b_v_merger_list_0_mlp_2, w_v_merger_list_0_mlp_2; + std::vector> b_v_merger_list_1_ln_q, w_v_merger_list_1_ln_q; + std::vector> 
b_v_merger_list_1_mlp_0, w_v_merger_list_1_mlp_0; + std::vector> b_v_merger_list_1_mlp_2, w_v_merger_list_1_mlp_2; + std::vector> b_v_merger_list_2_ln_q, w_v_merger_list_2_ln_q; + std::vector> b_v_merger_list_2_mlp_0, w_v_merger_list_2_mlp_0; + std::vector> b_v_merger_list_2_mlp_2, w_v_merger_list_2_mlp_2; + std::vector> b_v_patch_embed_proj, w_v_patch_embed_proj; + std::vector> w_v_pos_embed; +}; + +class Qwen3VLWeights : public infinicore::weights::Loader { +private: + std::vector> _device_weights; + +public: + Qwen3VLWeights(const Qwen3VLMeta *meta, + infiniDevice_t device, + const std::vector &dev_ids); + std::vector> &device_weights() { + return _device_weights; + } +}; + +struct DeviceResource { + // Device + infiniDevice_t device; + int device_id; + infiniopHandle_t handle; + // Weights + std::shared_ptr weights; + // Streams + infinirtStream_t stream; + // Communicator + infinicclComm_t comm; + + std::shared_ptr memory_pool; +}; + +struct InferRequest { + const uint32_t *tokens; + uint32_t ntok; + const uint32_t *req_lens; + uint32_t nreq; + const uint32_t *req_pos; + const uint32_t *pos_ids; // ViT/vision positions (e.g., [patches,2] or [patches,3]) + uint32_t pos_ids_len; + const float *pixel_values; + struct KVCache **kv_caches; + const float *temperature; + const uint32_t *topk; + const float *topp; + uint32_t *output; + void *logits; + + // LLM 3D mRoPE positions and rope_section + const uint32_t *llm_pos_ids; // shape (3, ntok) flattened, or nullptr + uint32_t llm_pos_ids_len; // must be 3*ntok if provided + const uint32_t *rope_section; // shape (3,), or nullptr + uint32_t rope_section_len; // must be 3 if provided +}; + +struct InferState { + std::mutex mtx; + std::condition_variable cv_load, cv_start, cv_done; + bool loaded = false; + bool proceed = false; + bool exit_flag = false; +}; + +struct Qwen3VLModel { + Qwen3VLMeta meta; + infiniDevice_t device; + std::vector dev_ids; + std::vector dev_resources; + std::vector states; + std::vector threads; + InferRequest req; + + Qwen3VLModel(const Qwen3VLMeta *, const ModelWeights *); +}; diff --git a/src/models/qwen3_vl/qwen3_vl_weight.cpp b/src/models/qwen3_vl/qwen3_vl_weight.cpp new file mode 100644 index 00000000..49109424 --- /dev/null +++ b/src/models/qwen3_vl/qwen3_vl_weight.cpp @@ -0,0 +1,370 @@ +#include "infinicore.h" +#include "qwen3_vl.hpp" + +#include +#include + +inline std::shared_ptr getSinTable(size_t dctx, size_t dh, float theta, infiniDtype_t dtype) { + auto half_dh = dh / 4; // 2D MRoPE: table_dim = dhead / 4 + auto unit = dsize(dtype); + void *table = std::malloc(dctx * half_dh * unit); + + for (size_t i = 0; i < dctx; i++) { + for (size_t j = 0; j < half_dh; j++) { + float _sin = std::sin( + static_cast(i) / std::pow(theta, static_cast(j) / half_dh)); + + if (dtype == INFINI_DTYPE_F16) { + ((uint16_t *)table)[i * half_dh + j] = f32_to_f16(_sin); + } else if (dtype == INFINI_DTYPE_BF16) { + ((uint16_t *)table)[i * half_dh + j] = f32_to_bf16(_sin); + } else if (dtype == INFINI_DTYPE_F32) { + ((float *)table)[i * half_dh + j] = _sin; + } else { + std::cout << "Sin table unsupported dtype" << std::endl; + std::abort(); + } + } + } + auto shape = std::vector({dctx, half_dh}); + auto tensor = Tensor::weight(table, dtype, shape); + std::free(table); + return tensor; +} + +inline std::shared_ptr getCosTable(size_t dctx, size_t dh, float theta, infiniDtype_t dtype) { + auto half_dh = dh / 4; // 2D MRoPE: table_dim = dhead / 4 + auto unit = dsize(dtype); + void *table = std::malloc(dctx * half_dh * unit); + + for 
(size_t i = 0; i < dctx; i++) { + for (size_t j = 0; j < half_dh; j++) { + float _cos = std::cos( + static_cast(i) / std::pow(theta, static_cast(j) / half_dh)); + + if (dtype == INFINI_DTYPE_F16) { + ((uint16_t *)table)[i * half_dh + j] = f32_to_f16(_cos); + } else if (dtype == INFINI_DTYPE_BF16) { + ((uint16_t *)table)[i * half_dh + j] = f32_to_bf16(_cos); + } else if (dtype == INFINI_DTYPE_F32) { + ((float *)table)[i * half_dh + j] = _cos; + } else { + std::cout << "Cos table unsupported dtype" << std::endl; + std::abort(); + } + } + } + auto shape = std::vector({dctx, half_dh}); + auto tensor = Tensor::weight(table, dtype, shape); + std::free(table); + return tensor; +} + +inline std::shared_ptr getSinTable_llm(size_t dctx, size_t dh, float theta, infiniDtype_t dtype) { + auto half_dh = dh / 2; // 3dmrope sin/cos 和普通rope一样 + auto unit = dsize(dtype); + void *table = std::malloc(dctx * half_dh * unit); + + for (size_t i = 0; i < dctx; i++) { + for (size_t j = 0; j < half_dh; j++) { + float _sin = std::sin( + static_cast(i) / std::pow(theta, static_cast(j) / half_dh)); + + if (dtype == INFINI_DTYPE_F16) { + ((uint16_t *)table)[i * half_dh + j] = f32_to_f16(_sin); + } else if (dtype == INFINI_DTYPE_BF16) { + ((uint16_t *)table)[i * half_dh + j] = f32_to_bf16(_sin); + } else if (dtype == INFINI_DTYPE_F32) { + ((float *)table)[i * half_dh + j] = _sin; + } else { + std::cout << "Sin table unsupported dtype" << std::endl; + std::abort(); + } + } + } + auto shape = std::vector({dctx, half_dh}); + auto tensor = Tensor::weight(table, dtype, shape); + std::free(table); + return tensor; +} + +inline std::shared_ptr getCosTable_llm(size_t dctx, size_t dh, float theta, infiniDtype_t dtype) { + auto half_dh = dh / 2; // 3dmrope sin/cos 和普通rope一样 + auto unit = dsize(dtype); + void *table = std::malloc(dctx * half_dh * unit); + + for (size_t i = 0; i < dctx; i++) { + for (size_t j = 0; j < half_dh; j++) { + float _cos = std::cos( + static_cast(i) / std::pow(theta, static_cast(j) / half_dh)); + + if (dtype == INFINI_DTYPE_F16) { + ((uint16_t *)table)[i * half_dh + j] = f32_to_f16(_cos); + } else if (dtype == INFINI_DTYPE_BF16) { + ((uint16_t *)table)[i * half_dh + j] = f32_to_bf16(_cos); + } else if (dtype == INFINI_DTYPE_F32) { + ((float *)table)[i * half_dh + j] = _cos; + } else { + std::cout << "Cos table unsupported dtype" << std::endl; + std::abort(); + } + } + } + auto shape = std::vector({dctx, half_dh}); + auto tensor = Tensor::weight(table, dtype, shape); + std::free(table); + return tensor; +} + +namespace { + +inline void print_info(const Qwen3VLMeta &meta) { + + printf("\nQwen3VLMeta: \n"); + // common + printf(" dt_logits : %d\n", meta.dt_logits); + printf(" nlayer : %ld\n", meta.nlayer); + printf(" d : %ld\n", meta.d); + printf(" dctx : %ld\n", meta.dctx); + printf(" dvoc : %ld\n", meta.dvoc); + printf(" epsilon : %f\n", meta.epsilon); + printf(" end_token : %d\n", meta.end_token); + + // llm + printf(" nh : %ld\n", meta.nh); + printf(" nkvh : %ld\n", meta.nkvh); + printf(" dh : %ld\n", meta.dh); + printf(" theta : %f\n", meta.theta); + + // vision encoder + printf(" vision_hidden_size : %ld\n", meta.vision_hidden_size); + printf(" vision_layers : %ld\n", meta.vision_layers); + printf(" vision_heads : %ld\n", meta.vision_heads); + printf(" patch_size : %ld\n", meta.patch_size); + printf(" img_size : %ld\n", meta.img_size); + printf(" image_token_id : %d\n", meta.image_token_id); + printf(" video_token_id : %d\n", meta.video_token_id); + + printf("\n"); +} +}; // namespace + +// Qwen3VLMeta: 
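+// (The example values below were captured from a print_info() run on a small Qwen3-VL
+//  checkpoint and are kept for reference only; the actual values always come from the
+//  Qwen3VLMeta supplied by the caller.)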
+// dt_logits : 19 +// nlayer : 28 +// d : 2048 +// dctx : 1024 +// dvoc : 152064 +// epsilon : 0.000001 +// end_token : 151645 +// nh : 16 +// nkvh : 8 +// dh : 128 +// theta : 5000000.000000 +// vision_hidden_size : 768 +// vision_layers : 12 +// vision_heads : 12 +// patch_size : 16 +// img_size : 768 +// image_token_id : 151655 +// video_token_id : 151656 + +// "out_hidden_size": 2048, +// "intermediate_size": 3072, + +Qwen3VLWeights::Qwen3VLWeights( + const Qwen3VLMeta *meta, + infiniDevice_t device, + const std::vector &dev_ids) : infinicore::weights::Loader(device, dev_ids) { + auto ndev = dev_ids.size(); + _device_weights.resize(ndev); + infiniDtype_t dt_logits = meta->dt_logits; + infiniDtype_t dt_norm_w = meta->dt_norm_w; + size_t nlayer = meta->nlayer; + size_t d = meta->d; + size_t nh = meta->nh / ndev; + size_t nkvh = meta->nkvh / ndev; + size_t dh = meta->dh; + size_t di = meta->di / ndev; + size_t dctx = meta->dctx; + size_t dvoc = meta->dvoc; + + print_info(*meta); + + // Vision encoder parameters + size_t vision_hidden_size = meta->vision_hidden_size; + size_t vision_layers = meta->vision_layers; + size_t vision_heads = meta->vision_heads; + size_t patch_size = meta->patch_size; + + for (size_t i = 0; i < ndev; i++) { + RUN_INFINI(infinirtSetDevice(device, dev_ids[i])); + + auto weight = std::make_shared(); + _device_weights[i] = weight; + + auto w_in_embd = Tensor::weight(nullptr, dt_logits, {dvoc, d}); + this->register_weight("model.embed_tokens.weight", w_in_embd, i); + weight->w_in_embd = w_in_embd; + + auto w_out_norm = Tensor::weight(nullptr, dt_norm_w, {d}); + this->register_weight("model.norm.weight", w_out_norm, i); + weight->w_out_norm = w_out_norm; + + auto w_out_embd = Tensor::weight(nullptr, dt_logits, {dvoc, d})->permute({1, 0}); + this->register_weight("lm_head.weight", w_out_embd, i); + weight->w_out_embd = w_out_embd; + + weight->sin_table = getSinTable_llm(dctx, dh, meta->theta, dt_logits); + weight->cos_table = getCosTable_llm(dctx, dh, meta->theta, dt_logits); + + // 视觉 mRoPE 表(2D):按 vision head 维度构建 + size_t dh_v = vision_heads > 0 ? 
(vision_hidden_size / vision_heads) : vision_hidden_size; + weight->sin_table_v = getSinTable(dctx, dh_v, meta->theta, dt_logits); + weight->cos_table_v = getCosTable(dctx, dh_v, meta->theta, dt_logits); + +#define REGISTER_LAYER_WEIGHT_1D(W_NAME, W_VAR, W_DIM, W_DTYPE, W_DIST_TYPE) \ + auto W_VAR = Tensor::weight(nullptr, W_DTYPE, {W_DIM}); \ + this->register_weight(W_NAME, W_VAR, i, infinicore::weights::DistributionType::W_DIST_TYPE); \ + weight->W_VAR.push_back(W_VAR); + +#define REGISTER_LAYER_WEIGHT_2D(W_NAME, W_VAR, W_DIM_1, W_DIM_2, W_DTYPE, W_DIST_TYPE) \ + auto W_VAR = Tensor::weight(nullptr, W_DTYPE, {W_DIM_1, W_DIM_2}); \ + this->register_weight(W_NAME, W_VAR, i, infinicore::weights::DistributionType::W_DIST_TYPE); \ + weight->W_VAR.push_back(W_VAR); + + // auto W_VAR = Tensor::weight(nullptr, W_DTYPE, {W_DIM_2, W_DIM_1})->permute({1, 0}); + +#define REGISTER_LAYER_WEIGHT_5D(W_NAME, W_VAR, W_DIM_1, W_DIM_2, W_DIM_3, W_DIM_4, W_DIM_5, W_DTYPE, W_DIST_TYPE) \ + auto W_VAR = Tensor::weight(nullptr, W_DTYPE, {W_DIM_1, W_DIM_2, W_DIM_3, W_DIM_4, W_DIM_5}); \ + this->register_weight(W_NAME, W_VAR, i, infinicore::weights::DistributionType::W_DIST_TYPE); \ + weight->W_VAR.push_back(W_VAR); + +// merger 权重 +#define REGISTER_MERGER() \ + REGISTER_LAYER_WEIGHT_1D("visual.merger.ln_q.bias", b_v_merger_ln_q, vision_hidden_size, dt_norm_w, FULL); \ + REGISTER_LAYER_WEIGHT_1D("visual.merger.ln_q.weight", w_v_merger_ln_q, vision_hidden_size, dt_norm_w, FULL); \ + REGISTER_LAYER_WEIGHT_1D("visual.merger.mlp.0.bias", b_v_merger_mlp_0, 4 * vision_hidden_size, dt_logits, FULL); \ + REGISTER_LAYER_WEIGHT_2D("visual.merger.mlp.0.weight", w_v_merger_mlp_0, 4 * vision_hidden_size, vision_hidden_size, dt_logits, FULL); \ + REGISTER_LAYER_WEIGHT_1D("visual.merger.mlp.2.bias", b_v_merger_mlp_2, vision_hidden_size, dt_logits, FULL); \ + REGISTER_LAYER_WEIGHT_2D("visual.merger.mlp.2.weight", w_v_merger_mlp_2, vision_hidden_size, 4 * vision_hidden_size, dt_logits, FULL); + +// merger_list 权重 +#define REGISTER_MERGER_LIST(IDX) \ + REGISTER_LAYER_WEIGHT_1D("visual.merger_list." #IDX ".ln_q.bias", b_v_merger_list_##IDX##_ln_q, 4 * vision_hidden_size, dt_norm_w, FULL); \ + REGISTER_LAYER_WEIGHT_1D("visual.merger_list." #IDX ".ln_q.weight", w_v_merger_list_##IDX##_ln_q, 4 * vision_hidden_size, dt_norm_w, FULL); \ + REGISTER_LAYER_WEIGHT_1D("visual.merger_list." #IDX ".mlp.0.bias", b_v_merger_list_##IDX##_mlp_0, 4 * vision_hidden_size, dt_logits, COLUMN); \ + REGISTER_LAYER_WEIGHT_2D("visual.merger_list." #IDX ".mlp.0.weight", w_v_merger_list_##IDX##_mlp_0, 4 * vision_hidden_size, 4 * vision_hidden_size, dt_logits, COLUMN); \ + REGISTER_LAYER_WEIGHT_1D("visual.merger_list." #IDX ".mlp.2.bias", b_v_merger_list_##IDX##_mlp_2, d, dt_logits, COLUMN); \ + REGISTER_LAYER_WEIGHT_2D("visual.merger_list." 
#IDX ".mlp.2.weight", w_v_merger_list_##IDX##_mlp_2, d, 4 * vision_hidden_size, dt_logits, COLUMN); + + // patch embed和pos embed权重 + // Loading visual.patch_embed.proj.bias: torch.Size([768]) + // Loading visual.patch_embed.proj.weight: torch.Size([768, 3, 2, 16, 16]) + // Loading visual.pos_embed.weight: torch.Size([2304, 768]) + REGISTER_LAYER_WEIGHT_1D("visual.patch_embed.proj.bias", b_v_patch_embed_proj, vision_hidden_size, dt_logits, FULL); + REGISTER_LAYER_WEIGHT_5D("visual.patch_embed.proj.weight", w_v_patch_embed_proj, vision_hidden_size, 3, 2, patch_size, patch_size, dt_logits, FULL); + REGISTER_LAYER_WEIGHT_2D("visual.pos_embed.weight", w_v_pos_embed, 3 * vision_hidden_size, vision_hidden_size, dt_logits, FULL); + + // merger 和 merger_list 权重 + for (size_t merger_layer = 0; merger_layer < 4; merger_layer++) { + if (merger_layer == 0) { + // Loading visual.merger.ln_q.bias: torch.Size([768]) + // Loading visual.merger.ln_q.weight: torch.Size([768]) + // Loading visual.merger.mlp.0.bias: torch.Size([3072]) + // Loading visual.merger.mlp.0.weight: torch.Size([3072, 3072]) ??? + // Loading visual.merger.mlp.2.bias: torch.Size([2048]) + // Loading visual.merger.mlp.2.weight: torch.Size([2048, 3072]) + REGISTER_MERGER(); + } else { + size_t merge_idx = merger_layer - 1; + // Loading visual.merger_list.X.ln_q.bias: torch.Size([3072]) + // Loading visual.merger_list.X.ln_q.weight: torch.Size([3072]) + // Loading visual.merger_list.X.mlp.0.bias: torch.Size([3072]) + // Loading visual.merger_list.X.mlp.0.weight: torch.Size([3072, 3072]) + // Loading visual.merger_list.X.mlp.2.bias: torch.Size([2048]) + // Loading visual.merger_list.X.mlp.2.weight: torch.Size([2048, 3072]) + if (merge_idx == 0) { + REGISTER_MERGER_LIST(0); + } else if (merge_idx == 1) { + REGISTER_MERGER_LIST(1); + } else { + REGISTER_MERGER_LIST(2); + } + } + } + + for (size_t layer = 0; layer < nlayer; layer++) { + + // vision encoder + if (layer < vision_layers) { + // Loading visual.blocks.0.attn.proj.bias: torch.Size([768]) + // Loading visual.blocks.0.attn.proj.weight: torch.Size([768, 768]) + // Loading visual.blocks.0.attn.qkv.bias: torch.Size([2304]) + // Loading visual.blocks.0.attn.qkv.weight: torch.Size([2304, 768]) + // Loading visual.blocks.0.mlp.linear_fc1.bias: torch.Size([3072]) + // Loading visual.blocks.0.mlp.linear_fc1.weight: torch.Size([3072, 768]) + // Loading visual.blocks.0.mlp.linear_fc2.bias: torch.Size([768]) + // Loading visual.blocks.0.mlp.linear_fc2.weight: torch.Size([768, 3072]) + // Loading visual.blocks.0.norm1.bias: torch.Size([768]) + // Loading visual.blocks.0.norm1.weight: torch.Size([768]) + // Loading visual.blocks.0.norm2.bias: torch.Size([768]) + // Loading visual.blocks.0.norm2.weight: torch.Size([768]) + REGISTER_LAYER_WEIGHT_1D("visual.blocks." + std::to_string(layer) + ".norm1.bias", b_v_norm1, vision_hidden_size, dt_norm_w, FULL); + REGISTER_LAYER_WEIGHT_1D("visual.blocks." + std::to_string(layer) + ".norm1.weight", w_v_norm1, vision_hidden_size, dt_norm_w, FULL); + REGISTER_LAYER_WEIGHT_1D("visual.blocks." + std::to_string(layer) + ".attn.proj.bias", b_v_attn_proj, vision_hidden_size, dt_logits, FULL); + REGISTER_LAYER_WEIGHT_2D("visual.blocks." + std::to_string(layer) + ".attn.proj.weight", w_v_attn_proj, vision_hidden_size, vision_hidden_size, dt_logits, FULL); + REGISTER_LAYER_WEIGHT_1D("visual.blocks." + std::to_string(layer) + ".attn.qkv.bias", b_v_attn_qkv, 3 * vision_hidden_size, dt_logits, FULL); + REGISTER_LAYER_WEIGHT_2D("visual.blocks." 
+ std::to_string(layer) + ".attn.qkv.weight", w_v_attn_qkv, 3 * vision_hidden_size, vision_hidden_size, dt_logits, FULL); + + REGISTER_LAYER_WEIGHT_1D("visual.blocks." + std::to_string(layer) + ".norm2.bias", b_v_norm2, vision_hidden_size, dt_norm_w, FULL); + REGISTER_LAYER_WEIGHT_1D("visual.blocks." + std::to_string(layer) + ".norm2.weight", w_v_norm2, vision_hidden_size, dt_norm_w, FULL); + REGISTER_LAYER_WEIGHT_1D("visual.blocks." + std::to_string(layer) + ".mlp.linear_fc1.bias", b_v_mlp_fc1, 4 * vision_hidden_size, dt_logits, FULL); + REGISTER_LAYER_WEIGHT_2D("visual.blocks." + std::to_string(layer) + ".mlp.linear_fc1.weight", w_v_mlp_fc1, 4 * vision_hidden_size, vision_hidden_size, dt_logits, FULL); + REGISTER_LAYER_WEIGHT_1D("visual.blocks." + std::to_string(layer) + ".mlp.linear_fc2.bias", b_v_mlp_fc2, vision_hidden_size, dt_logits, FULL); + REGISTER_LAYER_WEIGHT_2D("visual.blocks." + std::to_string(layer) + ".mlp.linear_fc2.weight", w_v_mlp_fc2, vision_hidden_size, 4 * vision_hidden_size, dt_logits, FULL); + } + + // llm + // Loading lm_head.weight: torch.Size([152064, 2048]) + // Loading model.embed_tokens.weight: torch.Size([152064, 2048]) + // Loading model.layers.0.input_layernorm.weight: torch.Size([2048]) + // Loading model.layers.0.mlp.down_proj.weight: torch.Size([2048, 6144]) + // Loading model.layers.0.mlp.gate_proj.weight: torch.Size([6144, 2048]) + // Loading model.layers.0.mlp.up_proj.weight: torch.Size([6144, 2048]) + // Loading model.layers.0.post_attention_layernorm.weight: torch.Size([2048]) + // Loading model.layers.0.self_attn.k_norm.weight: torch.Size([128]) + // Loading model.layers.0.self_attn.k_proj.weight: torch.Size([1024, 2048]) + // Loading model.layers.0.self_attn.o_proj.weight: torch.Size([2048, 2048]) + // Loading model.layers.0.self_attn.q_norm.weight: torch.Size([128]) + // Loading model.layers.0.self_attn.q_proj.weight: torch.Size([2048, 2048]) + // Loading model.layers.0.self_attn.v_proj.weight: torch.Size([1024, 2048]) + REGISTER_LAYER_WEIGHT_1D("model.layers." + std::to_string(layer) + ".input_layernorm.weight", w_attn_norm, d, dt_norm_w, FULL); + REGISTER_LAYER_WEIGHT_2D("model.layers." + std::to_string(layer) + ".self_attn.q_proj.weight", w_attn_q, d, nh * dh, dt_logits, COLUMN); + REGISTER_LAYER_WEIGHT_2D("model.layers." + std::to_string(layer) + ".self_attn.k_proj.weight", w_attn_k, d, nkvh * dh, dt_logits, COLUMN); + REGISTER_LAYER_WEIGHT_1D("model.layers." + std::to_string(layer) + ".self_attn.q_norm.weight", w_q_norm, nh * dh, dt_norm_w, FULL); + REGISTER_LAYER_WEIGHT_1D("model.layers." + std::to_string(layer) + ".self_attn.k_norm.weight", w_k_norm, nkvh * dh, dt_norm_w, FULL); + REGISTER_LAYER_WEIGHT_2D("model.layers." + std::to_string(layer) + ".self_attn.v_proj.weight", w_attn_v, d, nkvh * dh, dt_logits, COLUMN); + REGISTER_LAYER_WEIGHT_2D("model.layers." + std::to_string(layer) + ".self_attn.o_proj.weight", w_attn_out, nh * dh, d, dt_logits, ROW); + + REGISTER_LAYER_WEIGHT_1D("model.layers." + std::to_string(layer) + ".post_attention_layernorm.weight", w_ffn_norm, d, dt_norm_w, FULL); + REGISTER_LAYER_WEIGHT_2D("model.layers." + std::to_string(layer) + ".mlp.gate_proj.weight", w_ffn_gate, d, di, dt_logits, COLUMN); + REGISTER_LAYER_WEIGHT_2D("model.layers." + std::to_string(layer) + ".mlp.up_proj.weight", w_ffn_up, d, di, dt_logits, COLUMN); + REGISTER_LAYER_WEIGHT_2D("model.layers." 
+ std::to_string(layer) + ".mlp.down_proj.weight", w_ffn_down, di, d, dt_logits, ROW); + } + } + +#undef REGISTER_LAYER_WEIGHT_1D +#undef REGISTER_LAYER_WEIGHT_2D +#undef REGISTER_MERGER +#undef REGISTER_MERGER_LIST +} + +__C struct ModelWeights * +createQwen3VLWeights(const Qwen3VLMeta *meta, + infiniDevice_t device, + int ndev, + const int *dev_ids) { + Qwen3VLWeights *weights = new Qwen3VLWeights(meta, device, std::vector(dev_ids, dev_ids + ndev)); + return (struct ModelWeights *)weights; +} From 95ac97c7671b12c9f9bc90ccc19e4ce6ba0fe11a Mon Sep 17 00:00:00 2001 From: cearX Date: Tue, 13 Jan 2026 16:09:11 +0800 Subject: [PATCH 2/3] fix weight --- qw3vl.py | 53 +++++++++ scripts/qwen3vl.py | 86 +++++--------- src/cache_manager/opcache_manager.hpp | 6 +- src/models/inference_context.cpp | 2 +- src/models/qwen3_vl/qwen3_vl.cpp | 5 +- src/models/qwen3_vl/qwen3_vl_weight.cpp | 144 +++++++++--------------- 6 files changed, 141 insertions(+), 155 deletions(-) create mode 100644 qw3vl.py diff --git a/qw3vl.py b/qw3vl.py new file mode 100644 index 00000000..589b9839 --- /dev/null +++ b/qw3vl.py @@ -0,0 +1,53 @@ +from transformers import Qwen3VLForConditionalGeneration, AutoProcessor + +# default: Load the model on the available device(s) +model = Qwen3VLForConditionalGeneration.from_pretrained( + "/home/cearx/qy/model/Qwen3-VL-2B-Instruct", dtype="auto", device_map="auto" +) + +# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios. +# model = Qwen3VLForConditionalGeneration.from_pretrained( +# "Qwen/Qwen3-VL-2B-Instruct", +# dtype=torch.bfloat16, +# attn_implementation="flash_attention_2", +# device_map="auto", +# ) + +processor = AutoProcessor.from_pretrained( + "/home/cearx/qy/model/Qwen3-VL-2B-Instruct") + +# url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" +url = "/home/cearx/CLIP/image3.jpg" + +messages = [ + { + "role": "user", + "content": [ + { + "type": "image", + "image": url, + }, + {"type": "text", "text": "Describe this image."}, + ], + } +] + +# Preparation for inference +inputs = processor.apply_chat_template( + messages, + tokenize=True, + add_generation_prompt=True, + return_dict=True, + return_tensors="pt" +) +inputs = inputs.to(model.device) + +# Inference: Generation of the output +generated_ids = model.generate(**inputs, max_new_tokens=128) +generated_ids_trimmed = [ + out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) +] +output_text = processor.batch_decode( + generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False +) +print(output_text) diff --git a/scripts/qwen3vl.py b/scripts/qwen3vl.py index fcefb769..77c41feb 100644 --- a/scripts/qwen3vl.py +++ b/scripts/qwen3vl.py @@ -499,32 +499,16 @@ def load_all_safetensors_from_dir(self, dir_path_: str): dir_path_ = Path(dir_path_) total_keys = 0 - # 创建检查文件夹 - check_dir = Path("./check") - check_dir.mkdir(exist_ok=True) - for file in sorted(dir_path_.glob("*.safetensors")): with safetensors.safe_open(file, framework="pt", device="cpu") as f: for key in f.keys(): total_keys += 1 - tensor = f.get_tensor(key) - - # 保存张量 - self.save_tensor(key, tensor) - - # if "o_proj.scales" in key: - # tensor = tensor * self.meta.scale_o - # elif "down_proj.scales" in key: - # tensor = tensor * self.meta.scale_down - # elif "embed_tokens.weight" in key: - # tensor = tensor * self.meta.scale_input - # elif "lm_head.weight" in key: - # tensor = tensor * self.meta.scale_output - 
self.qwen3vl_model.load_weight( self.weights, key, tensor.data_ptr() ) + # 保存关键张量用于比对 + self.save_tensor(key, tensor) print(f"加载的张量 key 总数: {total_keys}") def save_tensor(self, key: str, tensor, check_dir=None): @@ -534,81 +518,67 @@ def save_tensor(self, key: str, tensor, check_dir=None): # 创建保存目录 check_dir.mkdir(exist_ok=True) - # 根据键名生成文件名 + # 根据键名生成文件名 (以 C++ 的键名为准) filename = None # 1. Patch Embedding - if key == "visual.patch_embed.proj.weight": + if key == "model.visual.patch_embed.proj.weight": filename = "1.patch_embd_w.txt" - elif key == "visual.patch_embed.proj.bias": + elif key == "model.visual.patch_embed.proj.bias": filename = "1.patch_embd_bias.txt" # 2. Position Embedding - elif key == "visual.pos_embed.weight": + elif key == "model.visual.pos_embed.weight": filename = "2.pos_embd.txt" # 3. Block0 相关张量 - elif key == "visual.blocks.0.norm1.weight": + elif key == "model.visual.blocks.0.norm1.weight": filename = "3.block0.norm1_w.txt" - elif key == "visual.blocks.0.norm1.bias": + elif key == "model.visual.blocks.0.norm1.bias": filename = "3.block0.norm1.bias.txt" - elif key == "visual.blocks.0.attn.qkv.weight": + elif key == "model.visual.blocks.0.attn.qkv.weight": filename = "3.block0.attn.qkv_w.txt" - elif key == "visual.blocks.0.attn.qkv.bias": + elif key == "model.visual.blocks.0.attn.qkv.bias": filename = "3.block0.attn.qkv.bias.txt" - elif key == "visual.blocks.0.attn.proj.weight": + elif key == "model.visual.blocks.0.attn.proj.weight": filename = "3.block0.attn.proj_w.txt" - elif key == "visual.blocks.0.attn.proj.bias": + elif key == "model.visual.blocks.0.attn.proj.bias": filename = "3.block0.attn.proj.bias.txt" - elif key == "visual.blocks.0.norm2.weight": + elif key == "model.visual.blocks.0.norm2.weight": filename = "4.block0.norm2_w.txt" - elif key == "visual.blocks.0.norm2.bias": + elif key == "model.visual.blocks.0.norm2.bias": filename = "4.block0.norm2.bias.txt" - elif key == "visual.blocks.0.mlp.linear_fc1.weight": + elif key == "model.visual.blocks.0.mlp.linear_fc1.weight": filename = "4.block0.mlp.fc1_w.txt" - elif key == "visual.blocks.0.mlp.linear_fc1.bias": + elif key == "model.visual.blocks.0.mlp.linear_fc1.bias": filename = "4.block0.mlp.fc1.bias.txt" - elif key == "visual.blocks.0.mlp.linear_fc2.weight": + elif key == "model.visual.blocks.0.mlp.linear_fc2.weight": filename = "4.block0.mlp.fc2_w.txt" - elif key == "visual.blocks.0.mlp.linear_fc2.bias": + elif key == "model.visual.blocks.0.mlp.linear_fc2.bias": filename = "4.block0.mlp.fc2.bias.txt" # 5. 
Merger - elif key == "visual.merger.norm.weight": + elif key == "model.visual.merger.norm.weight": filename = "5.merger.norm_w.txt" - elif key == "visual.merger.norm.bias": + elif key == "model.visual.merger.norm.bias": filename = "5.merger.norm.bias.txt" - elif key == "visual.merger.linear_fc1.weight": + elif key == "model.visual.merger.linear_fc1.weight": filename = "5.merger.fc1_w.txt" - elif key == "visual.merger.linear_fc1.bias": + elif key == "model.visual.merger.linear_fc1.bias": filename = "5.merger.fc1.bias.txt" - elif key == "visual.merger.linear_fc2.weight": + elif key == "model.visual.merger.linear_fc2.weight": filename = "5.merger.fc2_w.txt" - elif key == "visual.merger.linear_fc2.bias": + elif key == "model.visual.merger.linear_fc2.bias": filename = "5.merger.fc2.bias.txt" - # 兼容原有的merger键名 - elif key == "visual.merger.ln_q.weight": - filename = "5.merger.ln_q_w.txt" - elif key == "visual.merger.ln_q.bias": - filename = "5.merger.ln_q_bias.txt" - elif key == "visual.merger.mlp.0.weight": - filename = "5.merger.mlp0_w.txt" - elif key == "visual.merger.mlp.0.bias": - filename = "5.merger.mlp0_bias.txt" - elif key == "visual.merger.mlp.2.weight": - filename = "5.merger.mlp2_w.txt" - elif key == "visual.merger.mlp.2.bias": - filename = "5.merger.mlp2_bias.txt" - # 6. Deepstack Merger List (动态匹配) - elif "visual.deepstack_merger_list." in key: + elif "model.visual.deepstack_merger_list." in key: # 提取索引号 parts = key.split(".") - if len(parts) >= 3: + if len(parts) >= 4: try: - idx = int(parts[2]) # deepstack_merger_list.{idx}.xxx - suffix = ".".join(parts[3:]) + idx = int(parts[3]) # model.visual.deepstack_merger_list.{idx}.xxx + suffix = ".".join(parts[4:]) prefix = f"6.deepstack{idx}" if suffix == "norm.weight": diff --git a/src/cache_manager/opcache_manager.hpp b/src/cache_manager/opcache_manager.hpp index 96e9df05..66587352 100644 --- a/src/cache_manager/opcache_manager.hpp +++ b/src/cache_manager/opcache_manager.hpp @@ -9,6 +9,8 @@ #include "../tensor.hpp" #include "../utils.hpp" #include "infinicore_infer.h" +#include "infiniop/ops/2dmrope.h" +#include "infiniop/ops/3dmrope.h" #include "infiniop/ops/conv.h" #include "infiniop/ops/gelu.h" #include "infiniop/ops/layer_norm.h" @@ -186,9 +188,9 @@ class CacheManager { Topkrouter_cache(capacity, DESTROY_FUNC(Topkrouter)), SwiGLU_cache(capacity, DESTROY_FUNC(SwiGLU)), RandomSample_cache(capacity, DESTROY_FUNC(RandomSample)), + DequantizeAWQ_cache(capacity, DESTROY_FUNC(DequantizeAWQ)), Conv_cache(capacity, DESTROY_FUNC(Conv)), - Gelu_cache(capacity, DESTROY_FUNC(Gelu)), - DequantizeAWQ_cache(capacity, DESTROY_FUNC(DequantizeAWQ)) {} + Gelu_cache(capacity, DESTROY_FUNC(Gelu)) {} template static size_t createDescriptorKey(Tensors... 
tensors) { diff --git a/src/models/inference_context.cpp b/src/models/inference_context.cpp index 18562cff..0e074ab0 100644 --- a/src/models/inference_context.cpp +++ b/src/models/inference_context.cpp @@ -216,7 +216,7 @@ void InferenceContext::softmax(std::shared_ptr y, infiniopSoftmaxDescriptor_t desc; if (!cache_manager->getSoftmaxDescriptor(key, desc)) { RUN_INFINI(infiniopCreateSoftmaxDescriptor( - op_handle, &desc, y->desc(), x->desc())); + op_handle, &desc, y->desc(), x->desc(), -1)); cache_manager->putSoftmaxDescriptor(key, desc); } diff --git a/src/models/qwen3_vl/qwen3_vl.cpp b/src/models/qwen3_vl/qwen3_vl.cpp index 95e9c101..c6915a4c 100644 --- a/src/models/qwen3_vl/qwen3_vl.cpp +++ b/src/models/qwen3_vl/qwen3_vl.cpp @@ -538,8 +538,9 @@ void inferDeviceBatch(const Qwen3VLMeta *meta, DeviceResource &rsrc, const float *temperature, const uint32_t *topk, const float *topp, uint32_t *output, void *last_logits) { // DEBUG: 推理开始 - // printf("[DEBUG] Qwen3VL inferDeviceBatch START: idev=%u, ntok=%u, nreq=%u, has_vision=%s\n", - // idev, ntok, nreq, (pixel_values != nullptr) ? "true" : "false"); + printf("[DEBUG] Qwen3VL inferDeviceBatch START: idev=%u, ntok=%u, nreq=%u, has_vision=%s\n", + idev, ntok, nreq, (pixel_values != nullptr) ? "true" : "false"); + exit(0); auto nlayer = meta->nlayer; auto nkvh = meta->nkvh / ndev; auto nh = meta->nh / ndev; diff --git a/src/models/qwen3_vl/qwen3_vl_weight.cpp b/src/models/qwen3_vl/qwen3_vl_weight.cpp index 49109424..1344e666 100644 --- a/src/models/qwen3_vl/qwen3_vl_weight.cpp +++ b/src/models/qwen3_vl/qwen3_vl_weight.cpp @@ -204,15 +204,15 @@ Qwen3VLWeights::Qwen3VLWeights( _device_weights[i] = weight; auto w_in_embd = Tensor::weight(nullptr, dt_logits, {dvoc, d}); - this->register_weight("model.embed_tokens.weight", w_in_embd, i); + this->register_weight("model.language_model.embed_tokens.weight", w_in_embd, i); weight->w_in_embd = w_in_embd; auto w_out_norm = Tensor::weight(nullptr, dt_norm_w, {d}); - this->register_weight("model.norm.weight", w_out_norm, i); + this->register_weight("model.language_model.norm.weight", w_out_norm, i); weight->w_out_norm = w_out_norm; auto w_out_embd = Tensor::weight(nullptr, dt_logits, {dvoc, d})->permute({1, 0}); - this->register_weight("lm_head.weight", w_out_embd, i); + this->register_weight("model.lm_head.weight", w_out_embd, i); weight->w_out_embd = w_out_embd; weight->sin_table = getSinTable_llm(dctx, dh, meta->theta, dt_logits); @@ -240,50 +240,35 @@ Qwen3VLWeights::Qwen3VLWeights( this->register_weight(W_NAME, W_VAR, i, infinicore::weights::DistributionType::W_DIST_TYPE); \ weight->W_VAR.push_back(W_VAR); -// merger 权重 -#define REGISTER_MERGER() \ - REGISTER_LAYER_WEIGHT_1D("visual.merger.ln_q.bias", b_v_merger_ln_q, vision_hidden_size, dt_norm_w, FULL); \ - REGISTER_LAYER_WEIGHT_1D("visual.merger.ln_q.weight", w_v_merger_ln_q, vision_hidden_size, dt_norm_w, FULL); \ - REGISTER_LAYER_WEIGHT_1D("visual.merger.mlp.0.bias", b_v_merger_mlp_0, 4 * vision_hidden_size, dt_logits, FULL); \ - REGISTER_LAYER_WEIGHT_2D("visual.merger.mlp.0.weight", w_v_merger_mlp_0, 4 * vision_hidden_size, vision_hidden_size, dt_logits, FULL); \ - REGISTER_LAYER_WEIGHT_1D("visual.merger.mlp.2.bias", b_v_merger_mlp_2, vision_hidden_size, dt_logits, FULL); \ - REGISTER_LAYER_WEIGHT_2D("visual.merger.mlp.2.weight", w_v_merger_mlp_2, vision_hidden_size, 4 * vision_hidden_size, dt_logits, FULL); - -// merger_list 权重 -#define REGISTER_MERGER_LIST(IDX) \ - REGISTER_LAYER_WEIGHT_1D("visual.merger_list." 
#IDX ".ln_q.bias", b_v_merger_list_##IDX##_ln_q, 4 * vision_hidden_size, dt_norm_w, FULL); \ - REGISTER_LAYER_WEIGHT_1D("visual.merger_list." #IDX ".ln_q.weight", w_v_merger_list_##IDX##_ln_q, 4 * vision_hidden_size, dt_norm_w, FULL); \ - REGISTER_LAYER_WEIGHT_1D("visual.merger_list." #IDX ".mlp.0.bias", b_v_merger_list_##IDX##_mlp_0, 4 * vision_hidden_size, dt_logits, COLUMN); \ - REGISTER_LAYER_WEIGHT_2D("visual.merger_list." #IDX ".mlp.0.weight", w_v_merger_list_##IDX##_mlp_0, 4 * vision_hidden_size, 4 * vision_hidden_size, dt_logits, COLUMN); \ - REGISTER_LAYER_WEIGHT_1D("visual.merger_list." #IDX ".mlp.2.bias", b_v_merger_list_##IDX##_mlp_2, d, dt_logits, COLUMN); \ - REGISTER_LAYER_WEIGHT_2D("visual.merger_list." #IDX ".mlp.2.weight", w_v_merger_list_##IDX##_mlp_2, d, 4 * vision_hidden_size, dt_logits, COLUMN); +// merger 权重:norm(1024) -> fc1(4096,4096) -> fc2(4096,2048) +#define REGISTER_MERGER() \ + REGISTER_LAYER_WEIGHT_1D("model.visual.merger.norm.bias", b_v_merger_ln_q, vision_hidden_size, dt_norm_w, FULL); \ + REGISTER_LAYER_WEIGHT_1D("model.visual.merger.norm.weight", w_v_merger_ln_q, vision_hidden_size, dt_norm_w, FULL); \ + REGISTER_LAYER_WEIGHT_1D("model.visual.merger.linear_fc1.bias", b_v_merger_mlp_0, 4 * vision_hidden_size, dt_logits, FULL); \ + REGISTER_LAYER_WEIGHT_2D("model.visual.merger.linear_fc1.weight", w_v_merger_mlp_0, 4 * vision_hidden_size, 4 * vision_hidden_size, dt_logits, FULL); \ + REGISTER_LAYER_WEIGHT_1D("model.visual.merger.linear_fc2.bias", b_v_merger_mlp_2, d, dt_logits, FULL); \ + REGISTER_LAYER_WEIGHT_2D("model.visual.merger.linear_fc2.weight", w_v_merger_mlp_2, d, 4 * vision_hidden_size, dt_logits, FULL); + +// merger_list 权重(实际是 deepstack_merger_list) +#define REGISTER_MERGER_LIST(IDX) \ + REGISTER_LAYER_WEIGHT_1D("model.visual.deepstack_merger_list." #IDX ".norm.bias", b_v_merger_list_##IDX##_ln_q, 4 * vision_hidden_size, dt_norm_w, FULL); \ + REGISTER_LAYER_WEIGHT_1D("model.visual.deepstack_merger_list." #IDX ".norm.weight", w_v_merger_list_##IDX##_ln_q, 4 * vision_hidden_size, dt_norm_w, FULL); \ + REGISTER_LAYER_WEIGHT_1D("model.visual.deepstack_merger_list." #IDX ".linear_fc1.bias", b_v_merger_list_##IDX##_mlp_0, 4 * vision_hidden_size, dt_logits, COLUMN); \ + REGISTER_LAYER_WEIGHT_2D("model.visual.deepstack_merger_list." #IDX ".linear_fc1.weight", w_v_merger_list_##IDX##_mlp_0, 4 * vision_hidden_size, 4 * vision_hidden_size, dt_logits, COLUMN); \ + REGISTER_LAYER_WEIGHT_1D("model.visual.deepstack_merger_list." #IDX ".linear_fc2.bias", b_v_merger_list_##IDX##_mlp_2, d, dt_logits, COLUMN); \ + REGISTER_LAYER_WEIGHT_2D("model.visual.deepstack_merger_list." 
#IDX ".linear_fc2.weight", w_v_merger_list_##IDX##_mlp_2, d, 4 * vision_hidden_size, dt_logits, COLUMN); // patch embed和pos embed权重 - // Loading visual.patch_embed.proj.bias: torch.Size([768]) - // Loading visual.patch_embed.proj.weight: torch.Size([768, 3, 2, 16, 16]) - // Loading visual.pos_embed.weight: torch.Size([2304, 768]) - REGISTER_LAYER_WEIGHT_1D("visual.patch_embed.proj.bias", b_v_patch_embed_proj, vision_hidden_size, dt_logits, FULL); - REGISTER_LAYER_WEIGHT_5D("visual.patch_embed.proj.weight", w_v_patch_embed_proj, vision_hidden_size, 3, 2, patch_size, patch_size, dt_logits, FULL); - REGISTER_LAYER_WEIGHT_2D("visual.pos_embed.weight", w_v_pos_embed, 3 * vision_hidden_size, vision_hidden_size, dt_logits, FULL); - - // merger 和 merger_list 权重 + REGISTER_LAYER_WEIGHT_1D("model.visual.patch_embed.proj.bias", b_v_patch_embed_proj, vision_hidden_size, dt_logits, FULL); + REGISTER_LAYER_WEIGHT_5D("model.visual.patch_embed.proj.weight", w_v_patch_embed_proj, vision_hidden_size, 3, 2, patch_size, patch_size, dt_logits, FULL); + REGISTER_LAYER_WEIGHT_2D("model.visual.pos_embed.weight", w_v_pos_embed, 2304, vision_hidden_size, dt_logits, FULL); + + // merger 和 deepstack_merger_list 权重 for (size_t merger_layer = 0; merger_layer < 4; merger_layer++) { if (merger_layer == 0) { - // Loading visual.merger.ln_q.bias: torch.Size([768]) - // Loading visual.merger.ln_q.weight: torch.Size([768]) - // Loading visual.merger.mlp.0.bias: torch.Size([3072]) - // Loading visual.merger.mlp.0.weight: torch.Size([3072, 3072]) ??? - // Loading visual.merger.mlp.2.bias: torch.Size([2048]) - // Loading visual.merger.mlp.2.weight: torch.Size([2048, 3072]) REGISTER_MERGER(); } else { size_t merge_idx = merger_layer - 1; - // Loading visual.merger_list.X.ln_q.bias: torch.Size([3072]) - // Loading visual.merger_list.X.ln_q.weight: torch.Size([3072]) - // Loading visual.merger_list.X.mlp.0.bias: torch.Size([3072]) - // Loading visual.merger_list.X.mlp.0.weight: torch.Size([3072, 3072]) - // Loading visual.merger_list.X.mlp.2.bias: torch.Size([2048]) - // Loading visual.merger_list.X.mlp.2.weight: torch.Size([2048, 3072]) if (merge_idx == 0) { REGISTER_MERGER_LIST(0); } else if (merge_idx == 1) { @@ -296,61 +281,36 @@ Qwen3VLWeights::Qwen3VLWeights( for (size_t layer = 0; layer < nlayer; layer++) { - // vision encoder + // vision encoder blocks if (layer < vision_layers) { - // Loading visual.blocks.0.attn.proj.bias: torch.Size([768]) - // Loading visual.blocks.0.attn.proj.weight: torch.Size([768, 768]) - // Loading visual.blocks.0.attn.qkv.bias: torch.Size([2304]) - // Loading visual.blocks.0.attn.qkv.weight: torch.Size([2304, 768]) - // Loading visual.blocks.0.mlp.linear_fc1.bias: torch.Size([3072]) - // Loading visual.blocks.0.mlp.linear_fc1.weight: torch.Size([3072, 768]) - // Loading visual.blocks.0.mlp.linear_fc2.bias: torch.Size([768]) - // Loading visual.blocks.0.mlp.linear_fc2.weight: torch.Size([768, 3072]) - // Loading visual.blocks.0.norm1.bias: torch.Size([768]) - // Loading visual.blocks.0.norm1.weight: torch.Size([768]) - // Loading visual.blocks.0.norm2.bias: torch.Size([768]) - // Loading visual.blocks.0.norm2.weight: torch.Size([768]) - REGISTER_LAYER_WEIGHT_1D("visual.blocks." + std::to_string(layer) + ".norm1.bias", b_v_norm1, vision_hidden_size, dt_norm_w, FULL); - REGISTER_LAYER_WEIGHT_1D("visual.blocks." + std::to_string(layer) + ".norm1.weight", w_v_norm1, vision_hidden_size, dt_norm_w, FULL); - REGISTER_LAYER_WEIGHT_1D("visual.blocks." 
+ std::to_string(layer) + ".attn.proj.bias", b_v_attn_proj, vision_hidden_size, dt_logits, FULL); - REGISTER_LAYER_WEIGHT_2D("visual.blocks." + std::to_string(layer) + ".attn.proj.weight", w_v_attn_proj, vision_hidden_size, vision_hidden_size, dt_logits, FULL); - REGISTER_LAYER_WEIGHT_1D("visual.blocks." + std::to_string(layer) + ".attn.qkv.bias", b_v_attn_qkv, 3 * vision_hidden_size, dt_logits, FULL); - REGISTER_LAYER_WEIGHT_2D("visual.blocks." + std::to_string(layer) + ".attn.qkv.weight", w_v_attn_qkv, 3 * vision_hidden_size, vision_hidden_size, dt_logits, FULL); - - REGISTER_LAYER_WEIGHT_1D("visual.blocks." + std::to_string(layer) + ".norm2.bias", b_v_norm2, vision_hidden_size, dt_norm_w, FULL); - REGISTER_LAYER_WEIGHT_1D("visual.blocks." + std::to_string(layer) + ".norm2.weight", w_v_norm2, vision_hidden_size, dt_norm_w, FULL); - REGISTER_LAYER_WEIGHT_1D("visual.blocks." + std::to_string(layer) + ".mlp.linear_fc1.bias", b_v_mlp_fc1, 4 * vision_hidden_size, dt_logits, FULL); - REGISTER_LAYER_WEIGHT_2D("visual.blocks." + std::to_string(layer) + ".mlp.linear_fc1.weight", w_v_mlp_fc1, 4 * vision_hidden_size, vision_hidden_size, dt_logits, FULL); - REGISTER_LAYER_WEIGHT_1D("visual.blocks." + std::to_string(layer) + ".mlp.linear_fc2.bias", b_v_mlp_fc2, vision_hidden_size, dt_logits, FULL); - REGISTER_LAYER_WEIGHT_2D("visual.blocks." + std::to_string(layer) + ".mlp.linear_fc2.weight", w_v_mlp_fc2, vision_hidden_size, 4 * vision_hidden_size, dt_logits, FULL); + REGISTER_LAYER_WEIGHT_1D("model.visual.blocks." + std::to_string(layer) + ".norm1.bias", b_v_norm1, vision_hidden_size, dt_norm_w, FULL); + REGISTER_LAYER_WEIGHT_1D("model.visual.blocks." + std::to_string(layer) + ".norm1.weight", w_v_norm1, vision_hidden_size, dt_norm_w, FULL); + REGISTER_LAYER_WEIGHT_1D("model.visual.blocks." + std::to_string(layer) + ".attn.proj.bias", b_v_attn_proj, vision_hidden_size, dt_logits, FULL); + REGISTER_LAYER_WEIGHT_2D("model.visual.blocks." + std::to_string(layer) + ".attn.proj.weight", w_v_attn_proj, vision_hidden_size, vision_hidden_size, dt_logits, FULL); + REGISTER_LAYER_WEIGHT_1D("model.visual.blocks." + std::to_string(layer) + ".attn.qkv.bias", b_v_attn_qkv, 3 * vision_hidden_size, dt_logits, FULL); + REGISTER_LAYER_WEIGHT_2D("model.visual.blocks." + std::to_string(layer) + ".attn.qkv.weight", w_v_attn_qkv, 3 * vision_hidden_size, vision_hidden_size, dt_logits, FULL); + + REGISTER_LAYER_WEIGHT_1D("model.visual.blocks." + std::to_string(layer) + ".norm2.bias", b_v_norm2, vision_hidden_size, dt_norm_w, FULL); + REGISTER_LAYER_WEIGHT_1D("model.visual.blocks." + std::to_string(layer) + ".norm2.weight", w_v_norm2, vision_hidden_size, dt_norm_w, FULL); + REGISTER_LAYER_WEIGHT_1D("model.visual.blocks." + std::to_string(layer) + ".mlp.linear_fc1.bias", b_v_mlp_fc1, 4 * vision_hidden_size, dt_logits, FULL); + REGISTER_LAYER_WEIGHT_2D("model.visual.blocks." + std::to_string(layer) + ".mlp.linear_fc1.weight", w_v_mlp_fc1, 4 * vision_hidden_size, vision_hidden_size, dt_logits, FULL); + REGISTER_LAYER_WEIGHT_1D("model.visual.blocks." + std::to_string(layer) + ".mlp.linear_fc2.bias", b_v_mlp_fc2, vision_hidden_size, dt_logits, FULL); + REGISTER_LAYER_WEIGHT_2D("model.visual.blocks." 
+ std::to_string(layer) + ".mlp.linear_fc2.weight", w_v_mlp_fc2, vision_hidden_size, 4 * vision_hidden_size, dt_logits, FULL); } - // llm - // Loading lm_head.weight: torch.Size([152064, 2048]) - // Loading model.embed_tokens.weight: torch.Size([152064, 2048]) - // Loading model.layers.0.input_layernorm.weight: torch.Size([2048]) - // Loading model.layers.0.mlp.down_proj.weight: torch.Size([2048, 6144]) - // Loading model.layers.0.mlp.gate_proj.weight: torch.Size([6144, 2048]) - // Loading model.layers.0.mlp.up_proj.weight: torch.Size([6144, 2048]) - // Loading model.layers.0.post_attention_layernorm.weight: torch.Size([2048]) - // Loading model.layers.0.self_attn.k_norm.weight: torch.Size([128]) - // Loading model.layers.0.self_attn.k_proj.weight: torch.Size([1024, 2048]) - // Loading model.layers.0.self_attn.o_proj.weight: torch.Size([2048, 2048]) - // Loading model.layers.0.self_attn.q_norm.weight: torch.Size([128]) - // Loading model.layers.0.self_attn.q_proj.weight: torch.Size([2048, 2048]) - // Loading model.layers.0.self_attn.v_proj.weight: torch.Size([1024, 2048]) - REGISTER_LAYER_WEIGHT_1D("model.layers." + std::to_string(layer) + ".input_layernorm.weight", w_attn_norm, d, dt_norm_w, FULL); - REGISTER_LAYER_WEIGHT_2D("model.layers." + std::to_string(layer) + ".self_attn.q_proj.weight", w_attn_q, d, nh * dh, dt_logits, COLUMN); - REGISTER_LAYER_WEIGHT_2D("model.layers." + std::to_string(layer) + ".self_attn.k_proj.weight", w_attn_k, d, nkvh * dh, dt_logits, COLUMN); - REGISTER_LAYER_WEIGHT_1D("model.layers." + std::to_string(layer) + ".self_attn.q_norm.weight", w_q_norm, nh * dh, dt_norm_w, FULL); - REGISTER_LAYER_WEIGHT_1D("model.layers." + std::to_string(layer) + ".self_attn.k_norm.weight", w_k_norm, nkvh * dh, dt_norm_w, FULL); - REGISTER_LAYER_WEIGHT_2D("model.layers." + std::to_string(layer) + ".self_attn.v_proj.weight", w_attn_v, d, nkvh * dh, dt_logits, COLUMN); - REGISTER_LAYER_WEIGHT_2D("model.layers." + std::to_string(layer) + ".self_attn.o_proj.weight", w_attn_out, nh * dh, d, dt_logits, ROW); - - REGISTER_LAYER_WEIGHT_1D("model.layers." + std::to_string(layer) + ".post_attention_layernorm.weight", w_ffn_norm, d, dt_norm_w, FULL); - REGISTER_LAYER_WEIGHT_2D("model.layers." + std::to_string(layer) + ".mlp.gate_proj.weight", w_ffn_gate, d, di, dt_logits, COLUMN); - REGISTER_LAYER_WEIGHT_2D("model.layers." + std::to_string(layer) + ".mlp.up_proj.weight", w_ffn_up, d, di, dt_logits, COLUMN); - REGISTER_LAYER_WEIGHT_2D("model.layers." + std::to_string(layer) + ".mlp.down_proj.weight", w_ffn_down, di, d, dt_logits, ROW); + // language model layers + REGISTER_LAYER_WEIGHT_1D("model.language_model.layers." + std::to_string(layer) + ".input_layernorm.weight", w_attn_norm, d, dt_norm_w, FULL); + REGISTER_LAYER_WEIGHT_2D("model.language_model.layers." + std::to_string(layer) + ".self_attn.q_proj.weight", w_attn_q, d, nh * dh, dt_logits, COLUMN); + REGISTER_LAYER_WEIGHT_2D("model.language_model.layers." + std::to_string(layer) + ".self_attn.k_proj.weight", w_attn_k, d, nkvh * dh, dt_logits, COLUMN); + REGISTER_LAYER_WEIGHT_1D("model.language_model.layers." + std::to_string(layer) + ".self_attn.q_norm.weight", w_q_norm, nh * dh, dt_norm_w, FULL); + REGISTER_LAYER_WEIGHT_1D("model.language_model.layers." + std::to_string(layer) + ".self_attn.k_norm.weight", w_k_norm, nkvh * dh, dt_norm_w, FULL); + REGISTER_LAYER_WEIGHT_2D("model.language_model.layers." 
+ std::to_string(layer) + ".self_attn.v_proj.weight", w_attn_v, d, nkvh * dh, dt_logits, COLUMN); + REGISTER_LAYER_WEIGHT_2D("model.language_model.layers." + std::to_string(layer) + ".self_attn.o_proj.weight", w_attn_out, nh * dh, d, dt_logits, ROW); + + REGISTER_LAYER_WEIGHT_1D("model.language_model.layers." + std::to_string(layer) + ".post_attention_layernorm.weight", w_ffn_norm, d, dt_norm_w, FULL); + REGISTER_LAYER_WEIGHT_2D("model.language_model.layers." + std::to_string(layer) + ".mlp.gate_proj.weight", w_ffn_gate, d, di, dt_logits, COLUMN); + REGISTER_LAYER_WEIGHT_2D("model.language_model.layers." + std::to_string(layer) + ".mlp.up_proj.weight", w_ffn_up, d, di, dt_logits, COLUMN); + REGISTER_LAYER_WEIGHT_2D("model.language_model.layers." + std::to_string(layer) + ".mlp.down_proj.weight", w_ffn_down, di, d, dt_logits, ROW); } } From a0743963a96e9aa8ff13e8aa14083767b4dc0a1d Mon Sep 17 00:00:00 2001 From: cearX Date: Wed, 14 Jan 2026 19:57:51 +0800 Subject: [PATCH 3/3] fix debug_print_data_bf16 --- .gitignore | 4 + scripts/qwen3vl.py | 10 ++ src/models/qwen3_vl/qwen3_vl.cpp | 126 ++++++++++++++++++++++++ src/models/qwen3_vl/qwen3_vl_weight.cpp | 10 +- src/tensor/tensor.cpp | 2 +- 5 files changed, 145 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 767db187..5410fdfc 100644 --- a/.gitignore +++ b/.gitignore @@ -28,4 +28,8 @@ __pycache__/ # txt *.txt +# check +check_cpp/ +check/ + *.http diff --git a/scripts/qwen3vl.py b/scripts/qwen3vl.py index 77c41feb..d389b23a 100644 --- a/scripts/qwen3vl.py +++ b/scripts/qwen3vl.py @@ -530,6 +530,16 @@ def save_tensor(self, key: str, tensor, check_dir=None): # 2. Position Embedding elif key == "model.visual.pos_embed.weight": filename = "2.pos_embd.txt" + # # 保存二进制数据用于调试 (bfloat16需要用view转换) + # bin_path = check_dir / "2.pos_embd.bin" + # with open(bin_path, 'wb') as bf: + # # bfloat16转为uint16再保存 + # tensor_bytes = tensor.view(torch.uint16).numpy().tobytes() + # bf.write(tensor_bytes) + # print(f"[DEBUG] Saved pos_embd binary to {bin_path}, size: {len(tensor_bytes)} bytes") + # # 打印前10个值的hex(20字节,每个bfloat16是2字节) + # first_bytes = tensor.flatten()[:10].view(torch.uint16).numpy().tobytes() + # print(f"[DEBUG] First 10 values (hex): {first_bytes.hex()}") # 3. Block0 相关张量 elif key == "model.visual.blocks.0.norm1.weight": diff --git a/src/models/qwen3_vl/qwen3_vl.cpp b/src/models/qwen3_vl/qwen3_vl.cpp index c6915a4c..f2ccfa4d 100644 --- a/src/models/qwen3_vl/qwen3_vl.cpp +++ b/src/models/qwen3_vl/qwen3_vl.cpp @@ -540,6 +540,132 @@ void inferDeviceBatch(const Qwen3VLMeta *meta, DeviceResource &rsrc, // DEBUG: 推理开始 printf("[DEBUG] Qwen3VL inferDeviceBatch START: idev=%u, ntok=%u, nreq=%u, has_vision=%s\n", idev, ntok, nreq, (pixel_values != nullptr) ? 
"true" : "false"); + + // 打印关键权重张量用于比对 + auto &w = rsrc.weights; + + // // 保存二进制数据用于调试 + // w->w_v_pos_embed[0]->debug("check_cpp/2.pos_embd_cpp.bin"); + // printf("[DEBUG] Saved pos_embd binary from GPU\n"); + + // printf("\n=== 1.patch_embd_w ===\n"); + // w->w_v_patch_embed_proj[0]->debug(); + + // printf("\n=== 1.patch_embd_bias ===\n"); + // w->b_v_patch_embed_proj[0]->debug(); + + // printf("\n=== 2.pos_embd ===\n"); + // w->w_v_pos_embed[0]->debug(); + + // printf("\n=== 3.block0.norm1_w ===\n"); + // w->w_v_norm1[0]->debug(); + + // printf("\n=== 3.block0.norm1.bias ===\n"); + // w->b_v_norm1[0]->debug(); + + // printf("\n=== 3.block0.attn.qkv_w ===\n"); + // w->w_v_attn_qkv[0]->debug(); + + // printf("\n=== 3.block0.attn.qkv.bias ===\n"); + // w->b_v_attn_qkv[0]->debug(); + + // printf("\n=== 3.block0.attn.proj_w ===\n"); + // w->w_v_attn_proj[0]->debug(); + + // printf("\n=== 3.block0.attn.proj.bias ===\n"); + // w->b_v_attn_proj[0]->debug(); + + // printf("\n=== 4.block0.norm2_w ===\n"); + // w->w_v_norm2[0]->debug(); + + // printf("\n=== 4.block0.norm2.bias ===\n"); + // w->b_v_norm2[0]->debug(); + + // printf("\n=== 4.block0.mlp.fc1_w ===\n"); + // w->w_v_mlp_fc1[0]->debug(); + + // printf("\n=== 4.block0.mlp.fc1.bias ===\n"); + // w->b_v_mlp_fc1[0]->debug(); + + // printf("\n=== 4.block0.mlp.fc2_w ===\n"); + // w->w_v_mlp_fc2[0]->debug(); + + // printf("\n=== 4.block0.mlp.fc2.bias ===\n"); + // w->b_v_mlp_fc2[0]->debug(); + + // printf("\n=== 5.merger.norm_w ===\n"); + // w->w_v_merger_ln_q[0]->debug(); + + // printf("\n=== 5.merger.norm.bias ===\n"); + // w->b_v_merger_ln_q[0]->debug(); + + // printf("\n=== 5.merger.fc1_w ===\n"); + // w->w_v_merger_mlp_0[0]->debug(); + + // printf("\n=== 5.merger.fc1.bias ===\n"); + // w->b_v_merger_mlp_0[0]->debug(); + + // printf("\n=== 5.merger.fc2_w ===\n"); + // w->w_v_merger_mlp_2[0]->debug(); + + // printf("\n=== 5.merger.fc2.bias ===\n"); + // w->b_v_merger_mlp_2[0]->debug(); + + // printf("\n=== 6.deepstack0.norm_w ===\n"); + // w->w_v_merger_list_0_ln_q[0]->debug(); + + // printf("\n=== 6.deepstack0.norm.bias ===\n"); + // w->b_v_merger_list_0_ln_q[0]->debug(); + + // printf("\n=== 6.deepstack0.fc1_w ===\n"); + // w->w_v_merger_list_0_mlp_0[0]->debug(); + + // printf("\n=== 6.deepstack0.fc1.bias ===\n"); + // w->b_v_merger_list_0_mlp_0[0]->debug(); + + // printf("\n=== 6.deepstack0.fc2_w ===\n"); + // w->w_v_merger_list_0_mlp_2[0]->debug(); + + // printf("\n=== 6.deepstack0.fc2.bias ===\n"); + // w->b_v_merger_list_0_mlp_2[0]->debug(); + + // printf("\n=== 6.deepstack1.norm_w ===\n"); + // w->w_v_merger_list_1_ln_q[0]->debug(); + + // printf("\n=== 6.deepstack1.norm.bias ===\n"); + // w->b_v_merger_list_1_ln_q[0]->debug(); + + // printf("\n=== 6.deepstack1.fc1_w ===\n"); + // w->w_v_merger_list_1_mlp_0[0]->debug(); + + // printf("\n=== 6.deepstack1.fc1.bias ===\n"); + // w->b_v_merger_list_1_mlp_0[0]->debug(); + + // printf("\n=== 6.deepstack1.fc2_w ===\n"); + // w->w_v_merger_list_1_mlp_2[0]->debug(); + + // printf("\n=== 6.deepstack1.fc2.bias ===\n"); + // w->b_v_merger_list_1_mlp_2[0]->debug(); + + // printf("\n=== 6.deepstack2.norm_w ===\n"); + // w->w_v_merger_list_2_ln_q[0]->debug(); + + // printf("\n=== 6.deepstack2.norm.bias ===\n"); + // w->b_v_merger_list_2_ln_q[0]->debug(); + + // printf("\n=== 6.deepstack2.fc1_w ===\n"); + // w->w_v_merger_list_2_mlp_0[0]->debug(); + + // printf("\n=== 6.deepstack2.fc1.bias ===\n"); + // w->b_v_merger_list_2_mlp_0[0]->debug(); + + // printf("\n=== 6.deepstack2.fc2_w ===\n"); + // 
w->w_v_merger_list_2_mlp_2[0]->debug(); + + // printf("\n=== 6.deepstack2.fc2.bias ===\n"); + // w->b_v_merger_list_2_mlp_2[0]->debug(); + + printf("\n[DEBUG] Key tensors printed. Exiting...\n"); exit(0); auto nlayer = meta->nlayer; auto nkvh = meta->nkvh / ndev; diff --git a/src/models/qwen3_vl/qwen3_vl_weight.cpp b/src/models/qwen3_vl/qwen3_vl_weight.cpp index 1344e666..fae2fe24 100644 --- a/src/models/qwen3_vl/qwen3_vl_weight.cpp +++ b/src/models/qwen3_vl/qwen3_vl_weight.cpp @@ -154,16 +154,16 @@ inline void print_info(const Qwen3VLMeta &meta) { // nlayer : 28 // d : 2048 // dctx : 1024 -// dvoc : 152064 +// dvoc : 151936 // epsilon : 0.000001 // end_token : 151645 // nh : 16 // nkvh : 8 // dh : 128 // theta : 5000000.000000 -// vision_hidden_size : 768 -// vision_layers : 12 -// vision_heads : 12 +// vision_hidden_size : 1024 +// vision_layers : 24 +// vision_heads : 16 // patch_size : 16 // img_size : 768 // image_token_id : 151655 @@ -233,8 +233,6 @@ Qwen3VLWeights::Qwen3VLWeights( this->register_weight(W_NAME, W_VAR, i, infinicore::weights::DistributionType::W_DIST_TYPE); \ weight->W_VAR.push_back(W_VAR); - // auto W_VAR = Tensor::weight(nullptr, W_DTYPE, {W_DIM_2, W_DIM_1})->permute({1, 0}); - #define REGISTER_LAYER_WEIGHT_5D(W_NAME, W_VAR, W_DIM_1, W_DIM_2, W_DIM_3, W_DIM_4, W_DIM_5, W_DTYPE, W_DIST_TYPE) \ auto W_VAR = Tensor::weight(nullptr, W_DTYPE, {W_DIM_1, W_DIM_2, W_DIM_3, W_DIM_4, W_DIM_5}); \ this->register_weight(W_NAME, W_VAR, i, infinicore::weights::DistributionType::W_DIST_TYPE); \ diff --git a/src/tensor/tensor.cpp b/src/tensor/tensor.cpp index edf0faeb..37d8712a 100644 --- a/src/tensor/tensor.cpp +++ b/src/tensor/tensor.cpp @@ -267,7 +267,7 @@ void print_data_bf16(uint16_t const *data, const std::vector &shape, std::cout << std::endl; } else if (dim < shape.size() - 1) { for (size_t i = 0; i < shape[dim]; i++) { - print_data(data + i * strides[dim], shape, strides, dim + 1); + print_data_bf16(data + i * strides[dim], shape, strides, dim + 1); } } }
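For reference, a minimal sketch of how the two debug dumps mentioned above could be compared offline: it assumes the Python side wrote the raw bfloat16 payload of model.visual.pos_embed.weight to check/2.pos_embd.bin (the commented-out uint16 dump in save_tensor) and that the C++ side wrote check_cpp/2.pos_embd_cpp.bin via Tensor::debug with the same raw byte layout. Both file names and the raw-bytes format are assumptions taken from the comments in the patches, not something the patches guarantee.

import numpy as np

def load_bf16_as_f32(path: str) -> np.ndarray:
    # bfloat16 is the high half of an IEEE-754 float32, so widening each
    # 16-bit word and shifting it into the upper 16 bits recovers the exact value.
    bits = np.fromfile(path, dtype=np.uint16).astype(np.uint32) << 16
    return bits.view(np.float32)

ref = load_bf16_as_f32("check/2.pos_embd.bin")          # assumed Python-side dump (safetensors weight)
got = load_bf16_as_f32("check_cpp/2.pos_embd_cpp.bin")  # assumed C++-side dump (Tensor::debug)

assert ref.size == got.size, f"element count mismatch: {ref.size} vs {got.size}"
print("max abs diff :", np.abs(ref - got).max())
print("first 10 ref :", ref[:10])
print("first 10 got :", got[:10])

If the weight name registered in qwen3_vl_weight.cpp really maps to the same checkpoint tensor, both files should hold identical bfloat16 bits and the maximum absolute difference should be exactly zero; any nonzero difference points at a name mismatch or a layout/transpose issue rather than numerical error.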