diff --git a/contrib/models/Qwen2.5-VL-32B-Instruct/README.md b/contrib/models/Qwen2.5-VL-32B-Instruct/README.md index 9fcc7822..7e7168b5 100644 --- a/contrib/models/Qwen2.5-VL-32B-Instruct/README.md +++ b/contrib/models/Qwen2.5-VL-32B-Instruct/README.md @@ -1,130 +1,46 @@ -# Contrib Model: Qwen2.5 VL 32B Instruct +# Qwen2.5-VL-32B-Instruct -NeuronX Distributed Inference implementation of Qwen2.5 VL 32B Instruct. +> **Use the unified implementation at [`Qwen2.5-VL-7B-Instruct`](../Qwen2.5-VL-7B-Instruct/).** That implementation is config-driven and supports all Qwen2.5-VL sizes (3B, 7B, 32B, 72B) including both text-only and vision-language inference. -> **Note:** This implementation has been validated using the **text backbone only**. Vision/image modalities are implemented but not yet verified. +The code previously in this directory's `src/` was a text-only stub that lacked M-RoPE and the vision encoder. It achieved 0% token match in validation and has been removed to avoid confusion. -## Model Information +## 32B-Specific Guidance -- **HuggingFace ID:** `Qwen/Qwen2.5-VL-32B-Instruct` -- **Model Type:** Decoder-only transformer -- **License:** Check HuggingFace model card +### Model Dimensions -## Architecture Details +| Parameter | Value | +|-----------|-------| +| Layers | 64 | +| Hidden Size | 5120 | +| Attention Heads (Q / KV) | 40 / 8 (GQA) | +| Intermediate Size | 27648 | +| Vocab | 152064 | +| `tie_word_embeddings` | False | +| M-RoPE sections | [16, 24, 24] (same as all Qwen2.5-VL sizes) | -- **Layers:** Check model config -- **Hidden Size:** Check model config -- **Attention Heads:** Check model config -- **Vocabulary:** Check model config -- **Max Position Embeddings:** Check model config +### Recommended Configuration -## Validation Results +| Instance | TP | Notes | +|----------|----|-------| +| trn2.3xlarge (LNC=1) | 8 | ~64 GB BF16 weights. Requires LNC=1 for TP=8 on trn2.3xlarge. 
| +| trn2.48xlarge | 8-16 | More headroom for KV cache and batch size > 1. | -**Validated:** 2026-01-29 -**Configuration:** TP=2, batch_size=1, seq_len=128, bfloat16 +- **TP=8** is the minimum for 32B (~64 GB BF16 weights). On trn2.3xlarge this requires **LNC=1** (8 logical cores, 12 GB HBM each). +- TP=4 will not fit -- the model is too large for 4 cores at LNC=2 (24 GB/core, ~16 GB weights/core). +- The MLP NKI kernel status is untested for 32B (`intermediate_size/TP = 3456` at TP=8, which is within SBUF limits). +- Multi-bucket CTE should work the same as 7B -- use `context_encoding_buckets=[512, 1024, 2048, 4096]`. -### Test Results - -| Test | Status | Result | -|------|--------|--------| -| Smoke Test | ✅ PASS | Model loads successfully | -| Token Matching | ⚠️ N/A | **0.0% match** | -| TTFT (P50) | ✅ PASS | 7.98ms (threshold: 100ms) | -| Throughput | ✅ PASS | 120.65 tok/s (threshold: 10 tok/s) | - -### Performance Metrics - -| Metric | Value | -|--------|-------| -| TTFT (P50) | 7.98ms | -| Throughput | 120.65 tokens/s | - - -**Status:** ✅ VALIDATED - -### Device Profiling Metrics - -**Configuration:** TP=8, batch_size=1, seq_len=128, bfloat16 -**Instance:** trn1.32xlarge | **Profiled:** 2026-03-20 - -| Metric | Context Encoding | Token Generation | -|--------|-----------------|------------------| -| MFU (%) | 0.23 | 0.00 | -| MBU (%) | 0.44 | 0.60 | -| HFU (%) | 0.25 | 0.01 | -| Execution Time (us) | 0.05 | 0.03 | -| HBM Read | 8.30 GB | 8.01 GB | -| HBM Write | 263.30 MB | 5.77 MB | - -**Throughput:** 20.68 tok/s | **Compile Time:** 952.27s - -> Metrics from `neuron-profile capture` on compiled NEFFs. MFU = Model FLOPs Utilization, -> MBU = Memory Bandwidth Utilization, HFU = Hardware FLOPs Utilization. 
- -## Usage +### Quick Start ```python -from transformers import AutoTokenizer, GenerationConfig -from neuronx_distributed_inference.models.config import NeuronConfig -from neuronx_distributed_inference.utils.hf_adapter import load_pretrained_config - -# Import model classes from src -from src.modeling_qwen2_5_vl_32b_instruct import NeuronQwen25VL32BInstructForCausalLM, Qwen25VL32BInstructInferenceConfig - -model_path = "/path/to/Qwen2.5-VL-32B-Instruct/" -compiled_model_path = "/path/to/compiled/" - -# Configure -neuron_config = NeuronConfig( - tp_degree=2, - batch_size=1, - seq_len=512, - torch_dtype=torch.bfloat16, -) +# Point model_path at the 32B checkpoint -- same code as VL-7B +model_path = "/path/to/Qwen2.5-VL-32B-Instruct" -config = Qwen25VL32BInstructInferenceConfig( - neuron_config, - load_config=load_pretrained_config(model_path), +# Use the unified implementation from the VL-7B contrib +from src.modeling_qwen2_5_vl import ( + NeuronQwen2_5_VLForCausalLM, + Qwen2_5_VLInferenceConfig, ) - -# Compile and load -model = NeuronQwen25VL32BInstructForCausalLM(model_path, config) -model.compile(compiled_model_path) -model.load(compiled_model_path) - -# Generate -tokenizer = AutoTokenizer.from_pretrained(model_path) -# ... 
(see integration test for full example) -``` - -## Compatibility Matrix - -| Instance/Version | 2.20+ | 2.19 and earlier | -|------------------|-------|------------------| -| Trn1 | ✅ Working | Not tested | -| Inf2 | Not tested | Not tested | - -## Testing - -Run integration tests: - -```bash -pytest nxdi_contrib_models/models/Qwen2.5-VL-32B-Instruct/test/integration/test_model.py --capture=tee-sys ``` -Or run manually: - -```bash -cd nxdi_contrib_models/models/Qwen2.5-VL-32B-Instruct -python3 test/integration/test_model.py -``` - -## Example Checkpoints - -* Qwen/Qwen2.5-VL-32B-Instruct - -## Maintainer - -Annapurna Labs - -**Last Updated:** 2026-01-29 +See the [Qwen2.5-VL-7B-Instruct README](../Qwen2.5-VL-7B-Instruct/README.md) for full usage examples, vllm-neuron serving instructions, and known limitations. diff --git a/contrib/models/Qwen2.5-VL-32B-Instruct/src/__init__.py b/contrib/models/Qwen2.5-VL-32B-Instruct/src/__init__.py deleted file mode 100644 index 93534dc6..00000000 --- a/contrib/models/Qwen2.5-VL-32B-Instruct/src/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .modeling_qwen2_5_vl import NeuronQwen2_5_VLForCausalLM, Qwen2_5_VLInferenceConfig - -__all__ = ["NeuronQwen2_5_VLForCausalLM", "Qwen2_5_VLInferenceConfig"] diff --git a/contrib/models/Qwen2.5-VL-32B-Instruct/src/modeling_qwen2_5_vl.py b/contrib/models/Qwen2.5-VL-32B-Instruct/src/modeling_qwen2_5_vl.py deleted file mode 100644 index 1c321cfb..00000000 --- a/contrib/models/Qwen2.5-VL-32B-Instruct/src/modeling_qwen2_5_vl.py +++ /dev/null @@ -1,479 +0,0 @@ -# coding=utf-8 -# Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -PyTorch Qwen2.5-VL model for NeuronX Distributed Inference -""" - -import json -import os -from typing import List, Optional, Tuple, Type - -import torch -import torch.nn as nn -from neuronx_distributed.parallel_layers.layers import ( - ColumnParallelLinear, - ParallelEmbedding, -) -from neuronx_distributed.utils import cpu_mode -from transformers import AutoModelForCausalLM -from transformers.models.llama.modeling_llama import LlamaRMSNorm - -from neuronx_distributed_inference.models.config import InferenceConfig, NeuronConfig -from neuronx_distributed_inference.models.llama.modeling_llama import NeuronLlamaMLP -from neuronx_distributed_inference.models.model_base import ( - NeuronBaseForCausalLM, - NeuronBaseModel, -) -from neuronx_distributed_inference.modules.attention.attention_base import NeuronAttentionBase -from neuronx_distributed_inference.modules.attention.utils import RotaryEmbedding -from neuronx_distributed_inference.modules.custom_calls import CustomRMSNorm - - -def get_rmsnorm_cls(): - """ - Initialize to the appropriate implementation of RMSNorm - If infer on NXD -> CustomRMSNorm - If infer on CPU -> HF_RMSNorm (CustomRMSNorm does not work on CPU) - """ - return LlamaRMSNorm if cpu_mode() else CustomRMSNorm - - -class Qwen2_5_VLNeuronConfig(NeuronConfig): - """ - Neuron-specific configuration for Qwen2.5-VL model - """ - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.attn_cls = NeuronQwen2_5_VLAttention - - -class Qwen2_5_VLInferenceConfig(InferenceConfig): - """ - Configuration class for Qwen2.5-VL text model 
inference on NeuronX - - This configuration handles the text component of the Qwen2.5-VL multimodal model. - The vision component is preprocessed and embedded as part of the input sequence. - """ - - def __init__(self, neuron_config=None, **kwargs): - """ - Initialize configuration - - Note: neuron_config can be None during initial loading for inference. - It will be set later by the inference framework. - """ - # Store the neuron_config temporarily if it's None - # The base class will handle validation only if neuron_config is not None - if neuron_config is not None: - super().__init__(neuron_config=neuron_config, **kwargs) - else: - # Temporarily create a minimal neuron_config to pass validation - # This will be overwritten by the inference framework - from neuronx_distributed_inference.models.config import NeuronConfig - temp_config = NeuronConfig(tp_degree=1, batch_size=1, seq_len=512) - super().__init__(neuron_config=temp_config, **kwargs) - # Mark that this needs to be replaced - self._neuron_config_placeholder = True - - def add_derived_config(self): - """Add derived configuration parameters""" - self.num_cores_per_group = 1 - self.qkv_bias = True # Qwen2.5-VL uses bias in QKV projections - self.o_bias = False # No bias in output projection - - # Standard HuggingFace config attributes - if not hasattr(self, 'output_attentions'): - self.output_attentions = False - if not hasattr(self, 'output_hidden_states'): - self.output_hidden_states = False - if not hasattr(self, 'use_return_dict'): - self.use_return_dict = True - - # MRoPE configuration - Qwen2.5-VL uses multi-resolution RoPE - # with sections for [temporal, height, width] dimensions - if not hasattr(self, 'mrope_section'): - # Default mrope_section from config - self.mrope_section = getattr(self, 'mrope_section', [16, 24, 24]) - - def get_required_attributes(self) -> List[str]: - """List of required attributes for the configuration""" - return [ - "hidden_size", - "num_attention_heads", - 
"num_hidden_layers", - "num_key_value_heads", - "vocab_size", - "max_position_embeddings", - "rope_theta", - "rms_norm_eps", - "hidden_act", - "intermediate_size", - ] - - @classmethod - def get_neuron_config_cls(cls) -> Type[Qwen2_5_VLNeuronConfig]: - """Return the NeuronConfig class to use""" - return Qwen2_5_VLNeuronConfig - - @classmethod - def from_pretrained(cls, model_path: str, **kwargs): - """ - Load configuration from a pretrained model directory - - This handles two scenarios: - 1. Compilation: Loading from HuggingFace model with neuron_config passed in kwargs - 2. Inference: Loading from compiled artifacts (neuron_config.json exists) - - Args: - model_path: Path to the model directory - **kwargs: Additional arguments including neuron_config for compilation - - Returns: - Qwen2_5_VLInferenceConfig: Configuration object - """ - # Check if we're loading from compiled artifacts (inference scenario) - neuron_config_path = os.path.join(model_path, "neuron_config.json") - - # Extract neuron_config from kwargs if provided (compilation scenario) - neuron_config = kwargs.pop("neuron_config", None) - - # Read config.json to get model parameters - config_path = os.path.join(model_path, "config.json") - if not os.path.exists(config_path): - raise FileNotFoundError(f"Configuration file not found at {config_path}") - - with open(config_path, "r") as f: - hf_config = json.load(f) - - # Extract text_config if it exists (for full multimodal config) - # Otherwise use the config directly (for text-only or compiled config) - if "text_config" in hf_config: - text_config = hf_config["text_config"] - else: - text_config = hf_config - - # Map HuggingFace config to our config - config_dict = { - "hidden_size": text_config.get("hidden_size"), - "num_attention_heads": text_config.get("num_attention_heads"), - "num_hidden_layers": text_config.get("num_hidden_layers"), - "num_key_value_heads": text_config.get("num_key_value_heads"), - "vocab_size": text_config.get("vocab_size"), - 
"max_position_embeddings": text_config.get("max_position_embeddings"), - "rope_theta": text_config.get("rope_theta", 1000000.0), - "rms_norm_eps": text_config.get("rms_norm_eps", 1e-6), - "hidden_act": text_config.get("hidden_act", "silu"), - "intermediate_size": text_config.get("intermediate_size"), - "pad_token_id": text_config.get("pad_token_id", 151643), - "attention_dropout": text_config.get("attention_dropout", 0.0), - "use_cache": text_config.get("use_cache", True), - "tie_word_embeddings": text_config.get("tie_word_embeddings", False), - } - - # Handle rope_scaling with mrope_section - rope_scaling = text_config.get("rope_scaling", {}) - if rope_scaling: - config_dict["rope_scaling"] = rope_scaling - # Extract mrope_section if available - if "mrope_section" in rope_scaling: - config_dict["mrope_section"] = rope_scaling["mrope_section"] - - # Sliding window configuration - config_dict["use_sliding_window"] = text_config.get("use_sliding_window", False) - config_dict["sliding_window"] = text_config.get("sliding_window", 32768) - config_dict["max_window_layers"] = text_config.get("max_window_layers", config_dict["num_hidden_layers"]) - - # Override with remaining kwargs - config_dict.update(kwargs) - - # Create config object - config = cls(neuron_config=neuron_config, **config_dict) - return config - - -class NeuronQwen2_5_VLAttention(NeuronAttentionBase): - """ - Qwen2.5-VL attention implementation for NeuronX - - Key differences from standard attention: - - Uses bias in QKV projections (q_proj, k_proj, v_proj) - - No bias in output projection (o_proj) - - Supports MRoPE (Multi-Resolution Rotary Position Embedding) - - GQA support (40 attention heads, 8 KV heads for 32B model) - - Based on Qwen2_5_VLAttention from modeling_qwen2_5_vl.py - """ - - def __init__(self, config: Qwen2_5_VLInferenceConfig): - # Create rotary embedding with high base theta for long context - rotary_emb = RotaryEmbedding( - config.hidden_size // config.num_attention_heads, - 
max_position_embeddings=config.max_position_embeddings, - base=config.rope_theta, # Qwen2.5-VL uses 1000000.0 for long context - ) - - super().__init__( - config=config, - hidden_size=config.hidden_size, - num_attention_heads=config.num_attention_heads, - num_key_value_heads=config.num_key_value_heads, - head_dim=config.hidden_size // config.num_attention_heads, - qkv_bias=config.qkv_bias, # True for Qwen2.5-VL - o_bias=config.o_bias, # False for Qwen2.5-VL - rotary_emb=rotary_emb, - ) - - -class NeuronQwen2_5_VLDecoderLayer(nn.Module): - """ - Qwen2.5-VL decoder layer for NeuronX - - Structure: - 1. Input LayerNorm (RMSNorm) - 2. Self-Attention with MRoPE - 3. Residual connection - 4. Post-Attention LayerNorm (RMSNorm) - 5. MLP (SwiGLU activation) - 6. Residual connection - - Based on Qwen2_5_VLDecoderLayer from modeling_qwen2_5_vl.py - """ - - def __init__(self, config: Qwen2_5_VLInferenceConfig): - super().__init__() - self.hidden_size = config.hidden_size - - # Self-attention module - self.self_attn = NeuronQwen2_5_VLAttention(config) - - # MLP module - can reuse LlamaMLP as Qwen2.5-VL uses same structure - # gate_proj, up_proj, down_proj with SwiGLU activation - self.mlp = NeuronLlamaMLP(config) - - # Layer normalization (RMSNorm) - self.input_layernorm = get_rmsnorm_cls()( - config.hidden_size, - eps=config.rms_norm_eps, - ) - self.post_attention_layernorm = get_rmsnorm_cls()( - config.hidden_size, - eps=config.rms_norm_eps, - ) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - """ - Forward pass for Qwen2.5-VL decoder layer - - Args: - hidden_states: Input tensor of shape (batch, seq_len, hidden_size) - attention_mask: Optional attention mask - position_ids: Optional position indices - 
past_key_value: Optional cached key-value states - **kwargs: Additional arguments - - Returns: - Tuple of (hidden_states, present_key_value, cos_cache, sin_cache, None) - """ - residual = hidden_states - - # Pre-attention normalization - hidden_states = self.input_layernorm(hidden_states) - - # Self-attention - hidden_states, present_key_value, cos_cache, sin_cache = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - **kwargs, - ) - - # Residual connection - hidden_states = residual + hidden_states - - # MLP - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states)[0] # MLP returns (output, None) - hidden_states = residual + hidden_states - - outputs = (hidden_states, present_key_value, cos_cache, sin_cache, None) - - return outputs - - -class NeuronQwen2_5_VLModel(NeuronBaseModel): - """ - Qwen2.5-VL text model for NeuronX - - This implements the text decoder portion of the Qwen2.5-VL multimodal model. - For inference, vision inputs are preprocessed and embedded as special tokens - in the input sequence. 
- - Architecture: - - Token embeddings (ParallelEmbedding) - - Stack of decoder layers - - Final RMSNorm - - LM head for text generation - - Based on Qwen2_5_VLTextModel from modeling_qwen2_5_vl.py - """ - - def setup_attr_for_model(self, config: Qwen2_5_VLInferenceConfig): - """Setup attributes for model initialization""" - self.on_device_sampling = config.neuron_config.on_device_sampling_config is not None - self.tp_degree = config.neuron_config.tp_degree - self.hidden_size = config.hidden_size - self.num_attention_heads = config.num_attention_heads - self.num_key_value_heads = config.num_key_value_heads - self.max_batch_size = config.neuron_config.max_batch_size - self.buckets = config.neuron_config.buckets - - def init_model(self, config: Qwen2_5_VLInferenceConfig): - """Initialize the model components""" - self.padding_idx = getattr(config, 'pad_token_id', None) - self.vocab_size = config.vocab_size - - # Token embeddings - self.embed_tokens = ParallelEmbedding( - config.vocab_size, - config.hidden_size, - self.padding_idx, - dtype=config.neuron_config.torch_dtype, - shard_across_embedding=True, - pad=True, - ) - - # Decoder layers - self.layers = nn.ModuleList( - [NeuronQwen2_5_VLDecoderLayer(config) for _ in range(config.num_hidden_layers)] - ) - - # Final normalization - self.norm = get_rmsnorm_cls()(config.hidden_size, eps=config.rms_norm_eps) - - # LM head for generation - self.lm_head = ColumnParallelLinear( - config.hidden_size, - config.vocab_size, - bias=False, - pad=True, - gather_output=not self.on_device_sampling, - ) - - -class NeuronQwen2_5_VLForCausalLM(NeuronBaseForCausalLM): - """ - Qwen2.5-VL causal language model for NeuronX inference - - This class wraps the Qwen2.5-VL model for text generation. - For multimodal inputs, vision tokens should be preprocessed and - embedded in the input sequence before passing to this model. 
- """ - - _model_cls = NeuronQwen2_5_VLModel - - @staticmethod - def load_hf_model(model_path, **kwargs): - """Load the HuggingFace model""" - return AutoModelForCausalLM.from_pretrained(model_path, **kwargs) - - @staticmethod - def convert_hf_to_neuron_state_dict(state_dict: dict, config: InferenceConfig) -> dict: - """ - Convert HuggingFace state dict to Neuron format - - Key mappings: - - model.embed_tokens.weight -> model.embed_tokens.weight - - model.layers.X.self_attn.q_proj.weight -> model.layers.X.self_attn.qkv_proj.q_proj.weight - - model.layers.X.self_attn.k_proj.weight -> model.layers.X.self_attn.qkv_proj.k_proj.weight - - model.layers.X.self_attn.v_proj.weight -> model.layers.X.self_attn.qkv_proj.v_proj.weight - - model.layers.X.self_attn.o_proj.weight -> model.layers.X.self_attn.o_proj.weight - - model.layers.X.mlp.gate_proj.weight -> model.layers.X.mlp.gate_proj.weight - - model.layers.X.mlp.up_proj.weight -> model.layers.X.mlp.up_proj.weight - - model.layers.X.mlp.down_proj.weight -> model.layers.X.mlp.down_proj.weight - - model.norm.weight -> model.norm.weight - - lm_head.weight -> lm_head.weight (if not tied) - """ - neuron_state_dict = {} - neuron_config = config.neuron_config - - # Map weights from HF format to Neuron format - for name, param in state_dict.items(): - # Skip visual components for now (text-only model) - if 'visual' in name or 'vision' in name: - continue - - # Handle attention QKV projections - if '.self_attn.q_proj.' in name: - new_name = name.replace('.self_attn.q_proj.', '.self_attn.qkv_proj.q_proj.') - neuron_state_dict[new_name] = param.clone() - elif '.self_attn.k_proj.' in name: - new_name = name.replace('.self_attn.k_proj.', '.self_attn.qkv_proj.k_proj.') - neuron_state_dict[new_name] = param.clone() - elif '.self_attn.v_proj.' 
in name: - new_name = name.replace('.self_attn.v_proj.', '.self_attn.qkv_proj.v_proj.') - neuron_state_dict[new_name] = param.clone() - else: - # Copy other weights as-is - neuron_state_dict[name] = param.clone() - - # Add rank utilities for tensor parallel support - if neuron_config.vocab_parallel: - neuron_state_dict["model.embed_tokens.rank_util.rank"] = torch.arange( - 0, neuron_config.local_ranks_size, dtype=torch.int32 - ) - - num_layers = config.num_hidden_layers - tp_degree = neuron_config.tp_degree - for i in range(num_layers): - neuron_state_dict[f"model.layers.{i}.self_attn.rank_util.rank"] = torch.arange( - 0, tp_degree, dtype=torch.int32 - ) - - return neuron_state_dict - - @staticmethod - def update_state_dict_for_tied_weights(state_dict): - """Update state dict for models with tied embeddings""" - # Qwen2.5-VL typically doesn't tie weights, but handle it if needed - if "lm_head.weight" not in state_dict and "embed_tokens.weight" in state_dict: - state_dict["lm_head.weight"] = state_dict["embed_tokens.weight"].clone() - - @classmethod - def get_config_cls(cls): - """Return the configuration class""" - return Qwen2_5_VLInferenceConfig - - def get_compiler_args(self): - """ - Get compiler arguments for Neuron compilation - - Returns: - String of compiler flags optimized for Qwen2.5-VL - """ - compiler_args = "--enable-saturate-infinity --enable-mixed-precision-accumulation --auto-cast=none --model-type transformer -O1" - # Add flags for compute-communication overlap - compiler_args += " --tensorizer-options='--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma'" - compiler_args += " --internal-hlo2tensorizer-options='--verify-hlo=true'" - return compiler_args diff --git a/contrib/models/Qwen2.5-VL-32B-Instruct/test/integration/test_model.py b/contrib/models/Qwen2.5-VL-32B-Instruct/test/integration/test_model.py deleted file mode 100755 index 0dd07a5a..00000000 --- 
a/contrib/models/Qwen2.5-VL-32B-Instruct/test/integration/test_model.py +++ /dev/null @@ -1,236 +0,0 @@ -#!/usr/bin/env python3 -""" -Integration tests for Qwen2.5-VL-32B-Instruct NeuronX implementation. -""" - -import pytest -import torch -import json -from pathlib import Path -from transformers import AutoTokenizer, GenerationConfig - -from neuronx_distributed_inference.models.config import NeuronConfig -from neuronx_distributed_inference.utils.hf_adapter import load_pretrained_config - -# Import from src directory -import sys -sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) -from modeling_qwen2_5_vl import * - - -# Test configuration -MODEL_PATH = "/home/ubuntu/models/Qwen2.5-VL-32B-Instruct/" -COMPILED_MODEL_PATH = "/home/ubuntu/neuron_models/Qwen2.5-VL-32B-Instruct/" - - -def load_neuron_config_from_compiled(compiled_path: str): - """Load neuron configuration from compiled model's neuron_config.json.""" - config_path = Path(compiled_path) / "neuron_config.json" - - if not config_path.exists(): - raise FileNotFoundError(f"neuron_config.json not found: {config_path}") - - with open(config_path) as f: - config_data = json.load(f) - - if "neuron_config" in config_data: - return config_data["neuron_config"] - else: - return config_data - - -def create_model_for_inference(compiled_path: str, model_path: str): - """Create model for inference using compiled neuron_config.""" - neuron_config_dict = load_neuron_config_from_compiled(compiled_path) - - dtype_str = neuron_config_dict.get('torch_dtype', 'torch.bfloat16') - if isinstance(dtype_str, str): - dtype = getattr(torch, dtype_str.split('.')[1]) if dtype_str.startswith('torch.') else torch.bfloat16 - else: - dtype = dtype_str - - neuron_config_kwargs = { - 'tp_degree': neuron_config_dict.get('tp_degree', 2), - 'batch_size': neuron_config_dict.get('batch_size', 1), - 'seq_len': neuron_config_dict.get('seq_len', 128), - 'torch_dtype': dtype, - } - - neuron_config = 
NeuronConfig(**neuron_config_kwargs) - - # This will use the imported model and config classes - # The actual class names will be determined at runtime - return None, neuron_config - - -def generate_with_neuron_model(model, input_ids, max_new_tokens: int): - """Generate tokens using manual forward pass loop.""" - generated_ids = input_ids.clone() - - for _ in range(max_new_tokens): - seq_len = generated_ids.shape[1] - position_ids = torch.arange(seq_len).unsqueeze(0).expand(generated_ids.shape[0], -1) - - with torch.no_grad(): - outputs = model(generated_ids, position_ids=position_ids) - - if hasattr(outputs, 'logits'): - logits = outputs.logits - elif isinstance(outputs, tuple): - logits = outputs[0] - else: - logits = outputs - - next_token_logits = logits[:, -1, :] - next_token = torch.argmax(next_token_logits, dim=-1).unsqueeze(-1) - generated_ids = torch.cat([generated_ids, next_token], dim=-1) - - return generated_ids - - -@pytest.fixture(scope="module") -def compiled_model(): - """Load pre-compiled model.""" - # Note: Actual implementation would load the specific model class - # This is a template that should be customized per model - return None - - -@pytest.fixture(scope="module") -def tokenizer(): - """Load tokenizer.""" - tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, padding_side="right", trust_remote_code=True) - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - return tokenizer - - -def test_model_loads(compiled_model): - """Test that model loads successfully (smoke test).""" - assert compiled_model is not None - assert hasattr(compiled_model, 'config') - print("✓ Smoke test passed - Model loaded successfully") - - -def test_model_generates(compiled_model, tokenizer): - """Test that model can generate text.""" - prompt = "The capital of France is" - inputs = tokenizer(prompt, return_tensors="pt", padding=True) - - generated_ids = generate_with_neuron_model(compiled_model, inputs.input_ids, max_new_tokens=20) - 
output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True) - - assert len(output_text) > len(prompt), "Output should be longer than prompt" - print(f"✓ Generation test passed") - print(f" Output: {output_text}") - - -def test_output_coherence(compiled_model, tokenizer): - """Test that output is coherent (not gibberish).""" - prompt = "Hello, how are you?" - inputs = tokenizer(prompt, return_tensors="pt", padding=True) - - generated_ids = generate_with_neuron_model(compiled_model, inputs.input_ids, max_new_tokens=30) - output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True) - - # Coherence checks - assert len(output_text.split()) > 3, "Output should have multiple words" - assert not _is_repetitive(output_text), "Output should not be repetitive" - - print(f"✓ Coherence test passed") - print(f" Output: {output_text[:100]}...") - - - -def _is_repetitive(text: str, max_repeat: int = 5) -> bool: - """Check if text has excessive repetition.""" - words = text.split() - if len(words) < 10: - return False - - # Check for repeated words - for i in range(len(words) - max_repeat): - word = words[i] - if all(words[i+j] == word for j in range(max_repeat)): - return True - - # Check for repeated characters - new_text = text[-100:] if len(text) > 100 else text - if len(new_text) > 20: - char_counts = {} - for c in new_text: - char_counts[c] = char_counts.get(c, 0) + 1 - max_char_ratio = max(char_counts.values()) / len(new_text) - if max_char_ratio > 0.5: - return True - - return False - - -def test_performance_ttft(compiled_model, tokenizer): - """Test Time To First Token (TTFT) performance.""" - import time - - prompt = "Hello, how are you?" 
- inputs = tokenizer(prompt, return_tensors="pt", padding=True) - input_ids = inputs.input_ids - - # Warmup - for _ in range(3): - seq_len = input_ids.shape[1] - position_ids = torch.arange(seq_len).unsqueeze(0).expand(input_ids.shape[0], -1) - with torch.no_grad(): - _ = compiled_model(input_ids, position_ids=position_ids) - - # Measure TTFT - times = [] - for _ in range(10): - seq_len = input_ids.shape[1] - position_ids = torch.arange(seq_len).unsqueeze(0).expand(input_ids.shape[0], -1) - - start = time.perf_counter() - with torch.no_grad(): - _ = compiled_model(input_ids, position_ids=position_ids) - end = time.perf_counter() - - times.append((end - start) * 1000) # ms - - avg_ttft = sum(times) / len(times) - print(f"✓ TTFT: {avg_ttft:.2f}ms") - - - -def test_performance_throughput(compiled_model, tokenizer): - """Test token generation throughput.""" - import time - - prompt = "Hello" - inputs = tokenizer(prompt, return_tensors="pt", padding=True) - input_ids = inputs.input_ids - num_tokens = 50 - - # Warmup - _ = generate_with_neuron_model(compiled_model, input_ids, max_new_tokens=5) - - # Measure throughput - start = time.perf_counter() - _ = generate_with_neuron_model(compiled_model, input_ids, max_new_tokens=num_tokens) - end = time.perf_counter() - - total_time = end - start - throughput = num_tokens / total_time - print(f"✓ Throughput: {throughput:.2f} tok/s") - - - -if __name__ == "__main__": - print("="*80) - print("Qwen2.5-VL-32B-Instruct Integration Tests") - print("="*80) - - print("\nNote: This is a template test file.") - print("For actual model testing, customize the model loading logic.") - - print("\n" + "="*80) - print("✓ Template structure verified!") - print("="*80) diff --git a/contrib/models/Qwen2.5-VL-3B-Instruct/README.md b/contrib/models/Qwen2.5-VL-3B-Instruct/README.md index 781a3f77..55a89f36 100644 --- a/contrib/models/Qwen2.5-VL-3B-Instruct/README.md +++ b/contrib/models/Qwen2.5-VL-3B-Instruct/README.md @@ -1,130 +1,46 @@ -# Contrib 
Model: Qwen2.5 VL 3B Instruct +# Qwen2.5-VL-3B-Instruct -NeuronX Distributed Inference implementation of Qwen2.5 VL 3B Instruct. +> **Use the unified implementation at [`Qwen2.5-VL-7B-Instruct`](../Qwen2.5-VL-7B-Instruct/).** That implementation is config-driven and supports all Qwen2.5-VL sizes (3B, 7B, 32B, 72B) including both text-only and vision-language inference. -> **Note:** This implementation has been validated using the **text backbone only**. Vision/image modalities are implemented but not yet verified. +The code previously in this directory's `src/` was a text-only stub that lacked M-RoPE and the vision encoder. It has been removed to avoid confusion. -## Model Information +## 3B-Specific Guidance -- **HuggingFace ID:** `Qwen/Qwen2.5-VL-3B-Instruct` -- **Model Type:** Decoder-only transformer -- **License:** Check HuggingFace model card +### Model Dimensions -## Architecture Details +| Parameter | Value | +|-----------|-------| +| Layers | 36 | +| Hidden Size | 2048 | +| Attention Heads (Q / KV) | 16 / 2 (GQA) | +| Intermediate Size | 11008 | +| Vocab | 152064 | +| `tie_word_embeddings` | **True** (lm_head shares embed_tokens weights) | +| M-RoPE sections | [16, 24, 24] (same as all Qwen2.5-VL sizes) | -- **Layers:** Check model config -- **Hidden Size:** Check model config -- **Attention Heads:** Check model config -- **Vocabulary:** Check model config -- **Max Position Embeddings:** Check model config +### Recommended Configuration -## Validation Results +| Instance | TP | TKG tok/s | Compile Time | Weights/Core | +|----------|----|-----------|--------------|-------------| +| trn2.3xlarge (LNC=2) | 4 | 104.3 | 56.4s | 2.1 GB | +| inf2.xlarge | 2 | ~29 | ~148s | ~3 GB | -**Validated:** 2026-01-29 -**Configuration:** TP=2, batch_size=1, seq_len=128, bfloat16 +- **TP=4 on trn2.3xlarge** is the best option for throughput. +- **TP=2 on inf2.xlarge** works and is the cheapest option. The model is small enough (~6 GB BF16) to fit in 2 NeuronCores. 
+- The MLP NKI kernel compiles for 3B (`intermediate_size/TP = 2752`) but is 13% slower than baseline -- not recommended. +- Tied weights are handled automatically by `update_state_dict_for_tied_weights` in the unified implementation. -### Test Results - -| Test | Status | Result | -|------|--------|--------| -| Smoke Test | ✅ PASS | Model loads successfully | -| Token Matching | ⚠️ LOW | **67.2% match** | -| TTFT (P50) | ✅ PASS | 29.82ms (threshold: 100ms) | -| Throughput | ✅ PASS | 38.20 tok/s (threshold: 10 tok/s) | - -### Performance Metrics - -| Metric | Value | -|--------|-------| -| TTFT (P50) | 29.82ms | -| Throughput | 38.20 tokens/s | - - -**Status:** ✅ GOOD - -### Device Profiling Metrics - -**Configuration:** TP=2, batch_size=1, seq_len=128, bfloat16 -**Instance:** trn1.32xlarge | **Profiled:** 2026-03-21 - -| Metric | Context Encoding | Token Generation | -|--------|-----------------|------------------| -| MFU (%) | 0.13 | 0.00 | -| MBU (%) | 0.27 | 0.29 | -| HFU (%) | 0.15 | 0.02 | -| Execution Time (us) | 0.03 | 0.03 | -| HBM Read | 3.15 GB | 3.09 GB | -| HBM Write | 62.86 MB | 3.35 MB | - -**Throughput:** 32.98 tok/s | **Compile Time:** 224.93s - -> Metrics from `neuron-profile capture` on compiled NEFFs. MFU = Model FLOPs Utilization, -> MBU = Memory Bandwidth Utilization, HFU = Hardware FLOPs Utilization. 
- -## Usage +### Quick Start ```python -from transformers import AutoTokenizer, GenerationConfig -from neuronx_distributed_inference.models.config import NeuronConfig -from neuronx_distributed_inference.utils.hf_adapter import load_pretrained_config - -# Import model classes from src -from src.modeling_qwen2_5_vl_3b_instruct import NeuronQwen25VL3BInstructForCausalLM, Qwen25VL3BInstructInferenceConfig - -model_path = "/path/to/Qwen2.5-VL-3B-Instruct/" -compiled_model_path = "/path/to/compiled/" - -# Configure -neuron_config = NeuronConfig( - tp_degree=2, - batch_size=1, - seq_len=512, - torch_dtype=torch.bfloat16, -) +# Point model_path at the 3B checkpoint -- same code as VL-7B +model_path = "/path/to/Qwen2.5-VL-3B-Instruct" -config = Qwen25VL3BInstructInferenceConfig( - neuron_config, - load_config=load_pretrained_config(model_path), +# Use the unified implementation from the VL-7B contrib +from src.modeling_qwen2_5_vl import ( + NeuronQwen2_5_VLForCausalLM, + Qwen2_5_VLInferenceConfig, ) - -# Compile and load -model = NeuronQwen25VL3BInstructForCausalLM(model_path, config) -model.compile(compiled_model_path) -model.load(compiled_model_path) - -# Generate -tokenizer = AutoTokenizer.from_pretrained(model_path) -# ... 
(see integration test for full example) -``` - -## Compatibility Matrix - -| Instance/Version | 2.20+ | 2.19 and earlier | -|------------------|-------|------------------| -| Trn1 | ✅ Working | Not tested | -| Inf2 | Not tested | Not tested | - -## Testing - -Run integration tests: - -```bash -pytest nxdi_contrib_models/models/Qwen2.5-VL-3B-Instruct/test/integration/test_model.py --capture=tee-sys ``` -Or run manually: - -```bash -cd nxdi_contrib_models/models/Qwen2.5-VL-3B-Instruct -python3 test/integration/test_model.py -``` - -## Example Checkpoints - -* Qwen/Qwen2.5-VL-3B-Instruct - -## Maintainer - -Annapurna Labs - -**Last Updated:** 2026-01-29 +See the [Qwen2.5-VL-7B-Instruct README](../Qwen2.5-VL-7B-Instruct/README.md) for full usage examples, vllm-neuron serving instructions, and known limitations. diff --git a/contrib/models/Qwen2.5-VL-3B-Instruct/src/__init__.py b/contrib/models/Qwen2.5-VL-3B-Instruct/src/__init__.py deleted file mode 100644 index 7544ab7a..00000000 --- a/contrib/models/Qwen2.5-VL-3B-Instruct/src/__init__.py +++ /dev/null @@ -1,38 +0,0 @@ -# Qwen2.5-VL NeuronX Port - -from .config_qwen2vl import ( - Qwen2VLInferenceConfig, - Qwen2VLNeuronConfig, - Qwen2VLVisionConfig, -) -from .modeling_qwen2vl import ( - NeuronQwen2VLAttention, - NeuronQwen2VLDecoderLayer, - NeuronQwen2VLForConditionalGeneration, - NeuronQwen2VLMLP, - NeuronQwen2VLTextModel, -) -from .mrope import ( - Qwen2VLRotaryEmbedding, - apply_multimodal_rotary_pos_emb, - apply_rotary_pos_emb_vision, - rotate_half, -) - -__all__ = [ - # Config - "Qwen2VLInferenceConfig", - "Qwen2VLNeuronConfig", - "Qwen2VLVisionConfig", - # Models - "NeuronQwen2VLForConditionalGeneration", - "NeuronQwen2VLTextModel", - "NeuronQwen2VLDecoderLayer", - "NeuronQwen2VLAttention", - "NeuronQwen2VLMLP", - # MRoPE - "Qwen2VLRotaryEmbedding", - "apply_multimodal_rotary_pos_emb", - "apply_rotary_pos_emb_vision", - "rotate_half", -] diff --git a/contrib/models/Qwen2.5-VL-3B-Instruct/src/modeling_qwen2vl.py 
b/contrib/models/Qwen2.5-VL-3B-Instruct/src/modeling_qwen2vl.py deleted file mode 100644 index f2ab1e4d..00000000 --- a/contrib/models/Qwen2.5-VL-3B-Instruct/src/modeling_qwen2vl.py +++ /dev/null @@ -1,343 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The Qwen team, Alibaba Group and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -PyTorch Qwen2.5-VL model for NeuronX Distributed Inference - -This implementation focuses on the text model with MRoPE (Multimodal Rotary Position Embeddings). -Vision integration can be added in future iterations. 
-""" - -from typing import List, Optional, Tuple, Type - -import torch -import torch.nn as nn -from neuronx_distributed.parallel_layers.layers import ( - ColumnParallelLinear, - ParallelEmbedding, - RowParallelLinear, -) -from neuronx_distributed.utils import cpu_mode -from transformers.models.llama.modeling_llama import LlamaRMSNorm - -from neuronx_distributed_inference.models.config import InferenceConfig -from neuronx_distributed_inference.models.model_base import ( - NeuronBaseForCausalLM, - NeuronBaseModel, -) -from neuronx_distributed_inference.models.llama.modeling_llama import NeuronLlamaMLP -from neuronx_distributed_inference.modules.attention.attention_base import NeuronAttentionBase -from neuronx_distributed_inference.modules.custom_calls import CustomRMSNorm - -# Import our MRoPE implementation -# Use absolute imports since this module may be loaded directly -import os -import sys -# Add current directory to path if not already there -_current_dir = os.path.dirname(os.path.abspath(__file__)) -if _current_dir not in sys.path: - sys.path.insert(0, _current_dir) - -from mrope import Qwen2VLRotaryEmbedding, apply_multimodal_rotary_pos_emb -from config_qwen2vl import Qwen2VLInferenceConfig - - -def get_rmsnorm_cls(): - """Get the appropriate RMSNorm implementation""" - # Use CustomRMSNorm for NXD, LlamaRMSNorm for CPU - return LlamaRMSNorm if cpu_mode() else CustomRMSNorm - - -class NeuronQwen2VLAttention(NeuronAttentionBase): - """ - Qwen2.5-VL attention implementation with MRoPE support - - Key features: - - GQA (Grouped Query Attention) with configurable num_key_value_heads - - MRoPE (Multimodal Rotary Position Embeddings) for 3D position encoding - - Bias in QKV projections, no bias in output projection - - Note: For initial implementation, we use standard RoPE instead of MRoPE - to simplify integration. MRoPE can be added in a future iteration. 
- """ - - def __init__(self, config: Qwen2VLInferenceConfig, layer_idx: Optional[int] = None): - # For now, use standard rotary embeddings like Qwen2 - # TODO: Add full MRoPE support in future iteration - from neuronx_distributed_inference.modules.attention.utils import RotaryEmbedding - - head_dim = config.hidden_size // config.num_attention_heads - rotary_emb = RotaryEmbedding( - dim=head_dim, - max_position_embeddings=config.max_position_embeddings, - base=config.rope_theta, - ) - - # Store layer idx for sliding window attention - self.layer_idx = layer_idx - self.config = config - - # Determine if this layer uses sliding window attention - # Qwen2.5-VL has layer_types configuration - if hasattr(config, 'layer_types') and layer_idx is not None: - sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None - else: - # Default: no sliding window for initial implementation - sliding_window = None - - super().__init__( - config=config, - hidden_size=config.hidden_size, - num_attention_heads=config.num_attention_heads, - num_key_value_heads=config.num_key_value_heads, - head_dim=head_dim, - qkv_bias=config.qkv_bias, - o_bias=config.o_bias, - rotary_emb=rotary_emb, - sliding_window=sliding_window, - ) - - # Store MRoPE section configuration for future use - self.mrope_section = config.mrope_section - - -class NeuronQwen2VLMLP(NeuronLlamaMLP): - """ - Qwen2.5-VL MLP implementation - - Uses SwiGLU activation same as LLaMA, so we can reuse NeuronLlamaMLP - Formula: down_proj(silu(gate_proj(x)) * up_proj(x)) - """ - pass - - -class NeuronQwen2VLDecoderLayer(nn.Module): - """ - Qwen2.5-VL decoder layer - - Structure: - - Input LayerNorm - - Self Attention with MRoPE - - Residual connection - - Post-attention LayerNorm - - MLP - - Residual connection - """ - - def __init__(self, config: Qwen2VLInferenceConfig, layer_idx: Optional[int] = None): - super().__init__() - self.hidden_size = config.hidden_size - - # Attention with MRoPE 
- self.self_attn = NeuronQwen2VLAttention(config, layer_idx=layer_idx) - - # MLP (reuse LLaMA MLP since it's the same SwiGLU) - self.mlp = NeuronQwen2VLMLP(config) - - # Layer norms - self.input_layernorm = get_rmsnorm_cls()( - config.hidden_size, - eps=config.rms_norm_eps, - ) - self.post_attention_layernorm = get_rmsnorm_cls()( - config.hidden_size, - eps=config.rms_norm_eps, - ) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - """ - Forward pass for decoder layer - - Args: - hidden_states: Input tensor of shape (batch, seq_len, hidden_size) - attention_mask: Attention mask - position_ids: Position indices (can be 3D for MRoPE) - past_key_value: Cached key/value pairs - - Returns: - Tuple of (hidden_states, present_key_value, cos_cache, sin_cache, attn_weights) - """ - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, present_key_value, cos_cache, sin_cache = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - **kwargs, - ) - hidden_states = residual + hidden_states - - # MLP - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states)[0] - hidden_states = residual + hidden_states - - # Return format matching framework expectations - outputs = (hidden_states, present_key_value, cos_cache, sin_cache, None) - return outputs - - -class NeuronQwen2VLTextModel(NeuronBaseModel): - """ - Qwen2.5-VL text model (decoder-only) - - This is the core transformer model that processes text (and eventually multimodal) inputs. 
- """ - - def setup_attr_for_model(self, config: Qwen2VLInferenceConfig): - """Setup attributes required by the framework""" - self.on_device_sampling = config.neuron_config.on_device_sampling_config is not None - self.tp_degree = config.neuron_config.tp_degree - self.hidden_size = config.hidden_size - self.num_attention_heads = config.num_attention_heads - self.num_key_value_heads = config.num_key_value_heads - self.max_batch_size = config.neuron_config.max_batch_size - self.buckets = config.neuron_config.buckets - self.num_hidden_layers = config.num_hidden_layers - - def init_model(self, config: Qwen2VLInferenceConfig): - """Initialize model components""" - # Set padding_idx and vocab_size as attributes - self.padding_idx = config.pad_token_id if hasattr(config, 'pad_token_id') else None - self.vocab_size = config.vocab_size - - # Token embeddings - self.embed_tokens = ParallelEmbedding( - config.vocab_size, - config.hidden_size, - padding_idx=self.padding_idx, - dtype=config.neuron_config.torch_dtype, - shard_across_embedding=True, - pad=True, - ) - - # Decoder layers - self.layers = nn.ModuleList([ - NeuronQwen2VLDecoderLayer(config, layer_idx=i) - for i in range(config.num_hidden_layers) - ]) - - # Final layer norm - self.norm = get_rmsnorm_cls()( - config.hidden_size, - eps=config.rms_norm_eps, - ) - - # Language modeling head - self.lm_head = ColumnParallelLinear( - config.hidden_size, - config.vocab_size, - bias=False, - gather_output=True, - pad=True, - dtype=config.neuron_config.torch_dtype, - ) - - -class NeuronQwen2VLForConditionalGeneration(NeuronBaseForCausalLM): - """ - Qwen2.5-VL model for conditional generation (causal language modeling) - - This is the main entry point for the model, handling: - - Weight loading and conversion - - Language modeling head - - Generation interface - """ - - _model_cls = NeuronQwen2VLTextModel - - @staticmethod - def update_state_dict_for_tied_weights(state_dict): - """ - Update state dict to handle tied weights. 
- - Qwen2.5-VL ties the embedding and lm_head weights by default. - """ - # If lm_head.weight is not in the state dict (because of tied weights), - # copy it from embed_tokens - if "lm_head.weight" not in state_dict: - if "embed_tokens.weight" in state_dict: - state_dict["lm_head.weight"] = state_dict["embed_tokens.weight"].clone() - - @classmethod - def get_config_cls(cls): - """Return the configuration class for this model""" - return Qwen2VLInferenceConfig - - @staticmethod - def convert_hf_to_neuron_state_dict(state_dict: dict, config: InferenceConfig) -> dict: - """ - Convert HuggingFace state dict to NeuronX format - - Key mappings: - - model.embed_tokens.weight -> model.embed_tokens.weight - - model.layers.X.self_attn.q_proj.weight -> model.layers.X.self_attn.qkv_proj.q_proj.weight - - model.layers.X.self_attn.k_proj.weight -> model.layers.X.self_attn.qkv_proj.k_proj.weight - - model.layers.X.self_attn.v_proj.weight -> model.layers.X.self_attn.qkv_proj.v_proj.weight - - model.layers.X.self_attn.o_proj.weight -> model.layers.X.self_attn.o_proj.weight - - model.layers.X.mlp.gate_proj.weight -> model.layers.X.mlp.gate_proj.weight - - model.layers.X.mlp.up_proj.weight -> model.layers.X.mlp.up_proj.weight - - model.layers.X.mlp.down_proj.weight -> model.layers.X.mlp.down_proj.weight - - model.norm.weight -> model.norm.weight - - lm_head.weight -> lm_head.weight (if not tied) - """ - neuron_state_dict = {} - neuron_config = config.neuron_config - - # Map weights from HF format to Neuron format - for name, param in state_dict.items(): - # Skip visual components for now (text-only model) - if 'visual' in name: - continue - - # Handle attention QKV projections - if '.self_attn.q_proj.' in name: - new_name = name.replace('.self_attn.q_proj.', '.self_attn.qkv_proj.q_proj.') - neuron_state_dict[new_name] = param.clone() - elif '.self_attn.k_proj.' 
in name: - new_name = name.replace('.self_attn.k_proj.', '.self_attn.qkv_proj.k_proj.') - neuron_state_dict[new_name] = param.clone() - elif '.self_attn.v_proj.' in name: - new_name = name.replace('.self_attn.v_proj.', '.self_attn.qkv_proj.v_proj.') - neuron_state_dict[new_name] = param.clone() - else: - # Copy other weights as-is - neuron_state_dict[name] = param.clone() - - # Add rank utilities for tensor parallel support - if neuron_config.vocab_parallel: - neuron_state_dict["model.embed_tokens.rank_util.rank"] = torch.arange( - 0, neuron_config.local_ranks_size, dtype=torch.int32 - ) - - num_layers = config.num_hidden_layers - tp_degree = neuron_config.tp_degree - for i in range(num_layers): - neuron_state_dict[f"model.layers.{i}.self_attn.rank_util.rank"] = torch.arange( - 0, tp_degree, dtype=torch.int32 - ) - - return neuron_state_dict diff --git a/contrib/models/Qwen2.5-VL-3B-Instruct/test/__init__.py b/contrib/models/Qwen2.5-VL-3B-Instruct/test/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/contrib/models/Qwen2.5-VL-3B-Instruct/test/integration/__init__.py b/contrib/models/Qwen2.5-VL-3B-Instruct/test/integration/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/contrib/models/Qwen2.5-VL-3B-Instruct/test/integration/test_model.py b/contrib/models/Qwen2.5-VL-3B-Instruct/test/integration/test_model.py deleted file mode 100755 index 1b4eaf73..00000000 --- a/contrib/models/Qwen2.5-VL-3B-Instruct/test/integration/test_model.py +++ /dev/null @@ -1,236 +0,0 @@ -#!/usr/bin/env python3 -""" -Integration tests for Qwen2.5-VL-3B-Instruct NeuronX implementation. 
-""" - -import pytest -import torch -import json -from pathlib import Path -from transformers import AutoTokenizer, GenerationConfig - -from neuronx_distributed_inference.models.config import NeuronConfig -from neuronx_distributed_inference.utils.hf_adapter import load_pretrained_config - -# Import from src directory -import sys -sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) -from modeling_qwen2vl import * - - -# Test configuration -MODEL_PATH = "/home/ubuntu/models/Qwen2.5-VL-3B-Instruct/" -COMPILED_MODEL_PATH = "/home/ubuntu/neuron_models/Qwen2.5-VL-3B-Instruct/" - - -def load_neuron_config_from_compiled(compiled_path: str): - """Load neuron configuration from compiled model's neuron_config.json.""" - config_path = Path(compiled_path) / "neuron_config.json" - - if not config_path.exists(): - raise FileNotFoundError(f"neuron_config.json not found: {config_path}") - - with open(config_path) as f: - config_data = json.load(f) - - if "neuron_config" in config_data: - return config_data["neuron_config"] - else: - return config_data - - -def create_model_for_inference(compiled_path: str, model_path: str): - """Create model for inference using compiled neuron_config.""" - neuron_config_dict = load_neuron_config_from_compiled(compiled_path) - - dtype_str = neuron_config_dict.get('torch_dtype', 'torch.bfloat16') - if isinstance(dtype_str, str): - dtype = getattr(torch, dtype_str.split('.')[1]) if dtype_str.startswith('torch.') else torch.bfloat16 - else: - dtype = dtype_str - - neuron_config_kwargs = { - 'tp_degree': neuron_config_dict.get('tp_degree', 2), - 'batch_size': neuron_config_dict.get('batch_size', 1), - 'seq_len': neuron_config_dict.get('seq_len', 128), - 'torch_dtype': dtype, - } - - neuron_config = NeuronConfig(**neuron_config_kwargs) - - # This will use the imported model and config classes - # The actual class names will be determined at runtime - return None, neuron_config - - -def generate_with_neuron_model(model, input_ids, 
max_new_tokens: int): - """Generate tokens using manual forward pass loop.""" - generated_ids = input_ids.clone() - - for _ in range(max_new_tokens): - seq_len = generated_ids.shape[1] - position_ids = torch.arange(seq_len).unsqueeze(0).expand(generated_ids.shape[0], -1) - - with torch.no_grad(): - outputs = model(generated_ids, position_ids=position_ids) - - if hasattr(outputs, 'logits'): - logits = outputs.logits - elif isinstance(outputs, tuple): - logits = outputs[0] - else: - logits = outputs - - next_token_logits = logits[:, -1, :] - next_token = torch.argmax(next_token_logits, dim=-1).unsqueeze(-1) - generated_ids = torch.cat([generated_ids, next_token], dim=-1) - - return generated_ids - - -@pytest.fixture(scope="module") -def compiled_model(): - """Load pre-compiled model.""" - # Note: Actual implementation would load the specific model class - # This is a template that should be customized per model - return None - - -@pytest.fixture(scope="module") -def tokenizer(): - """Load tokenizer.""" - tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, padding_side="right", trust_remote_code=True) - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - return tokenizer - - -def test_model_loads(compiled_model): - """Test that model loads successfully (smoke test).""" - assert compiled_model is not None - assert hasattr(compiled_model, 'config') - print("✓ Smoke test passed - Model loaded successfully") - - -def test_model_generates(compiled_model, tokenizer): - """Test that model can generate text.""" - prompt = "The capital of France is" - inputs = tokenizer(prompt, return_tensors="pt", padding=True) - - generated_ids = generate_with_neuron_model(compiled_model, inputs.input_ids, max_new_tokens=20) - output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True) - - assert len(output_text) > len(prompt), "Output should be longer than prompt" - print(f"✓ Generation test passed") - print(f" Output: {output_text}") - - -def 
test_output_coherence(compiled_model, tokenizer): - """Test that output is coherent (not gibberish).""" - prompt = "Hello, how are you?" - inputs = tokenizer(prompt, return_tensors="pt", padding=True) - - generated_ids = generate_with_neuron_model(compiled_model, inputs.input_ids, max_new_tokens=30) - output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True) - - # Coherence checks - assert len(output_text.split()) > 3, "Output should have multiple words" - assert not _is_repetitive(output_text), "Output should not be repetitive" - - print(f"✓ Coherence test passed") - print(f" Output: {output_text[:100]}...") - - - -def _is_repetitive(text: str, max_repeat: int = 5) -> bool: - """Check if text has excessive repetition.""" - words = text.split() - if len(words) < 10: - return False - - # Check for repeated words - for i in range(len(words) - max_repeat): - word = words[i] - if all(words[i+j] == word for j in range(max_repeat)): - return True - - # Check for repeated characters - new_text = text[-100:] if len(text) > 100 else text - if len(new_text) > 20: - char_counts = {} - for c in new_text: - char_counts[c] = char_counts.get(c, 0) + 1 - max_char_ratio = max(char_counts.values()) / len(new_text) - if max_char_ratio > 0.5: - return True - - return False - - -def test_performance_ttft(compiled_model, tokenizer): - """Test Time To First Token (TTFT) performance.""" - import time - - prompt = "Hello, how are you?" 
- inputs = tokenizer(prompt, return_tensors="pt", padding=True) - input_ids = inputs.input_ids - - # Warmup - for _ in range(3): - seq_len = input_ids.shape[1] - position_ids = torch.arange(seq_len).unsqueeze(0).expand(input_ids.shape[0], -1) - with torch.no_grad(): - _ = compiled_model(input_ids, position_ids=position_ids) - - # Measure TTFT - times = [] - for _ in range(10): - seq_len = input_ids.shape[1] - position_ids = torch.arange(seq_len).unsqueeze(0).expand(input_ids.shape[0], -1) - - start = time.perf_counter() - with torch.no_grad(): - _ = compiled_model(input_ids, position_ids=position_ids) - end = time.perf_counter() - - times.append((end - start) * 1000) # ms - - avg_ttft = sum(times) / len(times) - print(f"✓ TTFT: {avg_ttft:.2f}ms") - - - -def test_performance_throughput(compiled_model, tokenizer): - """Test token generation throughput.""" - import time - - prompt = "Hello" - inputs = tokenizer(prompt, return_tensors="pt", padding=True) - input_ids = inputs.input_ids - num_tokens = 50 - - # Warmup - _ = generate_with_neuron_model(compiled_model, input_ids, max_new_tokens=5) - - # Measure throughput - start = time.perf_counter() - _ = generate_with_neuron_model(compiled_model, input_ids, max_new_tokens=num_tokens) - end = time.perf_counter() - - total_time = end - start - throughput = num_tokens / total_time - print(f"✓ Throughput: {throughput:.2f} tok/s") - - - -if __name__ == "__main__": - print("="*80) - print("Qwen2.5-VL-3B-Instruct Integration Tests") - print("="*80) - - print("\nNote: This is a template test file.") - print("For actual model testing, customize the model loading logic.") - - print("\n" + "="*80) - print("✓ Template structure verified!") - print("="*80) diff --git a/contrib/models/Qwen2.5-VL-3B-Instruct/test/unit/__init__.py b/contrib/models/Qwen2.5-VL-3B-Instruct/test/unit/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/contrib/models/Qwen2.5-VL-7B-Instruct/README.md 
b/contrib/models/Qwen2.5-VL-7B-Instruct/README.md new file mode 100644 index 00000000..143138b1 --- /dev/null +++ b/contrib/models/Qwen2.5-VL-7B-Instruct/README.md @@ -0,0 +1,319 @@ +# Contrib Model: Qwen2.5-VL-7B-Instruct + +Full vision-language implementation of Qwen2.5-VL-7B-Instruct on NeuronX Distributed Inference. Includes both the text decoder and the vision encoder with windowed attention. + +> **Note:** Unlike existing Qwen2.5-VL contrib entries (3B, 32B) which only support the text backbone, this implementation provides **complete vision-language inference** including image understanding. + +## Model Information + +- **HuggingFace ID:** `Qwen/Qwen2.5-VL-7B-Instruct` +- **Model Type:** Vision-Language (encoder-decoder with ViT vision encoder) +- **Architecture:** Qwen2.5-VL (text backbone identical to Qwen2-VL) +- **Parameters:** 7B (text) + 675M (vision) = ~8.3B total +- **License:** Check [HuggingFace model card](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) + +## Architecture Details + +### Text Decoder +- **Layers:** 28 +- **Hidden Size:** 3584 +- **Attention Heads:** 28 (Q) / 4 (KV) -- GQA +- **Intermediate Size:** 18944 (SwiGLU MLP) +- **Vocabulary:** 152064 +- **Max Position Embeddings:** 128000 +- **RoPE:** M-RoPE with sections [16, 24, 24] (temporal, height, width) +- **QKV Bias:** True, O Bias: False + +### Vision Encoder +- **Layers:** 32 +- **Hidden Size:** 1280 +- **Attention Heads:** 16 +- **MLP:** Gated SwiGLU with bias (intermediate_size=3420) +- **Normalization:** RMSNorm (not LayerNorm) +- **Attention:** Hybrid windowed (28 layers, 4x4 windows) + global (4 layers: [7,15,23,31]) +- **Patch Size:** 14x14, Temporal Patch Size: 2 +- **Spatial Merge Size:** 2 + +## Validation Results + +**Validated:** 2026-03-29 +**Instance:** trn2.3xlarge (LNC=2, 4 logical cores), trn2.48xlarge (72B) +**SDK:** Neuron SDK 2.28 + +### Test Results + +| Test | Status | Result | +|------|--------|--------| +| Smoke Test | PASS | Model loads from compiled 
artifacts | +| Text-only Generation | PASS | "The capital of France is Paris." (exact CPU match) | +| Logit Validation | PASS | Greedy output matches HF CPU reference | +| VL Generation | PASS | Correctly identifies shapes/colors in synthetic images | +| Multi-resolution VL | PASS | 224x224, 448x448, 672x672, 640x480 all working | +| vllm-neuron API | PASS | 6/6 OpenAI-compatible API tests passed | +| Multi-bucket CTE | PASS | 7/7 tests pass with optimized bucketing config | + +### Performance Metrics (TP=4, trn2.3xlarge, optimized config) + +Configuration: Multi-bucket CTE [512, 1024, 2048, 4096], vision flash attention enabled. + +| Metric | Text-only | Vision-Language | +|--------|-----------|-----------------| +| Token Generation | 86.4 tok/s | 86.7 tok/s | +| TPOT | 11.57 ms | 11.57 ms | +| HBM per Core | 4.2 GB | 4.2 GB | +| Compile Time | 81.6 s (5 NEFFs) | 81.6 s (text) + ~30 s (vision) | +| Model Load Time | 12-14 s | 12-14 s | + +### TTFT by Input Length (Multi-bucket CTE) + +| Input Tokens | CTE Bucket Used | TTFT (P50) | +|-------------|-----------------|------------| +| ~115 | 512 | **38.2 ms** | +| ~484 | 512 | **38.3 ms** | +| ~943 | 1024 | **57.6 ms** | +| ~1861 | 2048 | **95.0 ms** | +| ~3175 | 4096 | **182.8 ms** | + +Multi-bucket CTE provides **4.8x TTFT improvement** for short inputs vs single-bucket (38 ms vs 183 ms). 
+ +### Comparison with Qwen3-VL-8B (TP=4, trn2.3xlarge) + +| Metric | Qwen2.5-VL-7B | Qwen3-VL-8B | Difference | +|--------|---------------|-------------|------------| +| TKG throughput | 86.4 tok/s | 76.8 tok/s | **+12.5%** | +| TTFT (short input) | 38.2 ms | ~200 ms | **~5x faster** | +| HBM per core | 4.2 GB | ~5 GB | 19% smaller | + +### NKI Kernel Compatibility + +**Text decoder:** + +| Kernel | Status | Notes | +|--------|--------|-------| +| `qkv_kernel_enabled` | PASS | Fused RMSNorm+QKV ISA kernel, supports bias | +| `attn_kernel_enabled` | PASS | CTE flash attention NKI kernel | +| `attn_tkg_nki_kernel_enabled` | PASS | TKG NKI attention, 27.6 tok/s, exact match | +| `mlp_kernel_enabled` | FAIL | SBUF OOM: intermediate_size/TP = 4736 > 4096 | +| `attn_tkg_builtin_kernel_enabled` | FAIL | M-RoPE 3D rotary incompatible | +| `out_proj_kernel_enabled` | FAIL | hidden_size=3584 not divisible by 1024 | + +**Vision encoder:** + +| Kernel | Status | Notes | +|--------|--------|-------| +| `attn_kernel_enabled` | PASS | Flash attention for bidirectional vision | +| `qkv_kernel_enabled` | FAIL | Fused RMSNorm+QKV: eps type mismatch with vision RMSNorm | + +## Usage + +### Text-only Inference + +```python +import torch +from neuronx_distributed_inference.models.config import NeuronConfig +from neuronx_distributed_inference.utils.hf_adapter import ( + load_pretrained_config, + HuggingFaceGenerationAdapter, +) +from neuronx_distributed_inference.modules.generation.sampling import prepare_sampling_params +from transformers import AutoProcessor, GenerationConfig + +from src.modeling_qwen2_5_vl import NeuronQwen2_5_VLForCausalLM, Qwen2_5_VLInferenceConfig + +model_path = "/path/to/Qwen2.5-VL-7B-Instruct" +compiled_path = "/path/to/compiled" + +# Configure +text_neuron_config = NeuronConfig( + batch_size=1, ctx_batch_size=1, seq_len=4096, + tp_degree=4, world_size=4, + torch_dtype=torch.bfloat16, + fused_qkv=True, qkv_kernel_enabled=True, + attn_kernel_enabled=True, 
attn_tkg_nki_kernel_enabled=True, + logical_neuron_cores=2, cc_pipeline_tiling_factor=2, + cast_type="as-declared", save_sharded_checkpoint=True, + enable_bucketing=True, # Multi-bucket CTE for TTFT optimization + context_encoding_buckets=[512, 1024, 2048, 4096], # Min 512 for TKG NKI compat + token_generation_buckets=[4096], # Single TKG bucket at full seq_len +) +vision_neuron_config = NeuronConfig( + batch_size=1, seq_len=4096, + tp_degree=4, world_size=4, + torch_dtype=torch.bfloat16, + fused_qkv=True, enable_bucketing=True, buckets=[2], + attn_kernel_enabled=True, # Flash attention for bidirectional vision + logical_neuron_cores=2, cc_pipeline_tiling_factor=2, + cast_type="as-declared", save_sharded_checkpoint=True, +) +config = Qwen2_5_VLInferenceConfig( + text_neuron_config=text_neuron_config, + vision_neuron_config=vision_neuron_config, + load_config=load_pretrained_config(model_path), +) + +# Compile (first time only) +model = NeuronQwen2_5_VLForCausalLM(model_path=model_path, config=config) +model.compile(compiled_path) + +# Load compiled model +model.load(compiled_path) +adapter = HuggingFaceGenerationAdapter(model) +processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) + +# Generate text +messages = [{"role": "user", "content": [{"type": "text", "text": "What is the capital of France?"}]}] +text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) +inputs = processor(text=[text], return_tensors="pt") + +gen_config = GenerationConfig(do_sample=False, eos_token_id=[151645], pad_token_id=151645) +sampling = prepare_sampling_params(batch_size=1, top_k=[1], top_p=[1.0], temperature=[1.0]) + +with torch.no_grad(): + output = adapter.generate( + inputs.input_ids, attention_mask=inputs.attention_mask, + sampling_params=sampling, generation_config=gen_config, max_new_tokens=64, + ) +print(processor.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)) +# -> "The capital of France is 
Paris." +``` + +### Vision-Language Inference + +```python +from PIL import Image + +image = Image.open("photo.jpg") +messages = [{"role": "user", "content": [ + {"type": "image", "image": image}, + {"type": "text", "text": "Describe this image."}, +]}] +text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) +inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt") + +with torch.no_grad(): + output = adapter.generate( + inputs.input_ids, attention_mask=inputs.attention_mask, + pixel_values=inputs.pixel_values, image_grid_thw=inputs.image_grid_thw, + sampling_params=sampling, generation_config=gen_config, max_new_tokens=128, + ) +print(processor.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)) +``` + +### vllm-neuron Serving + +**Validated on both vllm-neuron 0.4.1 and 0.5.0 (6/6 API tests passed on each).** + +#### vllm-neuron 0.5.0 (recommended) + +```bash +# Install: git clone --branch release-0.5.0 https://github.com/vllm-project/vllm-neuron.git +# cd vllm-neuron && pip install --extra-index-url=https://pip.repos.neuron.amazonaws.com -e . + +# Apply patches (Qwen2.5-VL is not natively supported in 0.5.0): +python patch_vllm_050_qwen25vl.py --vllm-dir /path/to/vllm-neuron + +# Serve: +NEURON_COMPILED_ARTIFACTS=/path/to/compiled \ +PYTHONPATH=/path/to/qwen25vl:$PYTHONPATH \ +vllm serve /path/to/Qwen2.5-VL-7B-Instruct \ + --tensor-parallel-size 4 \ + --max-model-len 4096 \ + --max-num-seqs 1 \ + --port 8000 \ + --no-enable-prefix-caching +``` + +#### vllm-neuron 0.4.1 + +```bash +NEURON_COMPILED_ARTIFACTS=/path/to/compiled \ +vllm serve Qwen/Qwen2.5-VL-7B-Instruct \ + --tensor-parallel-size 4 \ + --max-model-len 4096 \ + --max-num-seqs 1 \ + --port 8000 \ + --no-enable-prefix-caching +``` + +Both versions require 4 file patches (constants.py, model_loader.py, model_runner.py, NxDI constants.py). 
Patch scripts: +- **0.5.0**: `patch_vllm_050_qwen25vl.py` (supports `--vllm-dir`, `--nxdi-constants`, `--qwen25vl-dir`) +- **0.4.1**: `patch_vllm_qwen25vl.py` + +## Compatibility Matrix + +| Instance Type | TP=4 | TP=2 | +|--------------|------|------| +| trn2.3xlarge (LNC=2) | **Validated** (86.4 tok/s, 38ms TTFT) | Validated (28.1 tok/s) | +| trn2.48xlarge | Not tested | Not tested | +| trn1.32xlarge | Not tested | Not tested | + +### Multi-Size Support + +The same code works for all Qwen2.5-VL sizes (config-driven). Tested: + +| Model | Instance | TP | TKG tok/s | Compile | Weights/Core | Notes | +|-------|----------|----|-----------|---------|-------------|-------| +| **7B** | trn2.3xlarge | 4 | 86.4 | 81.6s | 4.2 GB | Primary target | +| **3B** | trn2.3xlarge | 4 | 104.3 | 56.4s | 2.1 GB | `tie_word_embeddings=True` | +| **72B** | trn2.48xlarge | 32 | 44.3 | 508.4s | ~4.5 GB | 80 layers, 64 heads, 8 KV heads | + +**3B notes**: The 3B model uses tied weights (`lm_head` = `embed_tokens`). The `update_state_dict_for_tied_weights` override handles this automatically. MLP NKI kernel compiles for 3B (`intermediate/TP=2752`) but is 13% slower than baseline -- not recommended for 3B. + +**72B notes**: Requires trn2.48xlarge with TP=32 (146 GB BF16 weights). Compilation takes ~8.5 minutes. Vision encoder is identical to 3B/7B except `out_hidden_size=8192` and `intermediate_size=3456`. Steady-state TKG is 44.3 tok/s. VL inference works (33.8 tok/s with 672x672 image). + +## Implementation Notes + +### Vision Encoder Differences from Qwen2-VL + +The Qwen2.5-VL vision encoder differs from Qwen2-VL in several ways: +1. **RMSNorm** instead of LayerNorm +2. **Gated SwiGLU MLP** with bias=True (unique -- neither Qwen2-VL nor Qwen3-VL has this in vision) +3. **Windowed attention** with `get_window_index()` partitioning tokens into 4x4 windows +4. 
**Hybrid attention**: 28 windowed layers + 4 global layers at positions [7, 15, 23, 31] + +### Vision Buckets + +Vision buckets represent the **number of images**, not sequence lengths. Default is `buckets=[2]` which provides headroom for vllm image preprocessing that may resize images beyond the compiled `pixels_per_image`. + +### Key Files + +| File | Description | Lines | +|------|-------------|-------| +| `modeling_qwen2_5_vl.py` | VL orchestrator (config, forward, state dict) | ~370 | +| `modeling_qwen2_5_vl_text.py` | Text decoder (attention, MLP, decoder layers) | ~370 | +| `modeling_qwen2_5_vl_vision.py` | Vision encoder (windowed attn, SwiGLU, merger) | ~760 | + +## Example Checkpoints + +* [Qwen/Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) -- Primary target (7B) +* [Qwen/Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct) -- Validated (3B) +* [Qwen/Qwen2.5-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct) -- Validated (72B, requires trn2.48xlarge) + +## Testing + +```bash +# Set paths (adjust for your environment) +export QWEN25VL_MODEL_PATH=/mnt/models/Qwen2.5-VL-7B-Instruct +export QWEN25VL_COMPILED_PATH=/mnt/models/qwen25vl_compiled + +# Run all tests +pytest test/integration/test_model.py -v --capture=tee-sys + +# Run specific test +pytest test/integration/test_model.py::test_logit_validation -v +``` + +## Known Limitations + +1. **Batch size > 1** requires the VLM batch>1 fix from branch [`fix/qwen3-vl-batch-size-gt1-v2`](https://github.com/jimburtoft/neuronx-distributed-inference/tree/fix/qwen3-vl-batch-size-gt1-v2) (3 patches to NxDI `image_to_text_model_wrapper.py`). Without it, batch>1 crashes. +2. **MLP kernel** is not compatible (intermediate_size/TP = 4736 exceeds SBUF threshold ~4096). +3. **Builtin TKG kernel** is not compatible with M-RoPE 3D rotary embeddings. +4. 
**Vision qkv_kernel** is not compatible (fused RMSNorm+QKV ISA kernel fails with vision encoder's RMSNorm epsilon type). +5. **Multi-bucket CTE minimum bucket**: Must be >= 512. Auto-generated small buckets (e.g., 128) cause TKG NKI kernel assertion failure (`sharded_S_ctx % 128 == 0` with LNC=2). Always set `context_encoding_buckets` explicitly. +6. **Video input** is not tested (the model architecturally supports it via temporal patches). + +## Maintainer + +Jim Burtoft, AWS + +**Last Updated:** 2026-03-29 diff --git a/contrib/models/Qwen2.5-VL-7B-Instruct/patch_vllm_050_qwen25vl.py b/contrib/models/Qwen2.5-VL-7B-Instruct/patch_vllm_050_qwen25vl.py new file mode 100644 index 00000000..d42395b2 --- /dev/null +++ b/contrib/models/Qwen2.5-VL-7B-Instruct/patch_vllm_050_qwen25vl.py @@ -0,0 +1,271 @@ +""" +Patch vllm-neuron 0.5.0 to add Qwen2.5-VL support. + +vllm-neuron 0.5.0 is a pip-installable plugin (not a fork). +Install it first: git clone --branch release-0.5.0 https://github.com/vllm-project/vllm-neuron.git + pip install -e . + +This script patches 4 files: +1. vllm_neuron/worker/constants.py - Add to NEURON_MULTI_MODAL_MODELS +2. NxDI constants.py - Register qwen2_5_vl in MODEL_TYPES +3. vllm_neuron/worker/neuronx_distributed_model_loader.py - Add wrapper class + dispatches +4. vllm_neuron/worker/neuronx_distributed_model_runner.py - Add multimodal data routing + +Usage: + python patch_vllm_050_qwen25vl.py [--vllm-dir /path/to/vllm-neuron] [--nxdi-constants /path/to/constants.py] + + If --vllm-dir is not specified, auto-detects from vllm_neuron package location. + If --nxdi-constants is not specified, auto-detects from neuronx_distributed_inference package location. 
+""" + +import argparse +import os +import sys + + +def patch_file(path, check_string, old_string, new_string, description): + """Patch a file by replacing old_string with new_string.""" + if not os.path.exists(path): + print(f"ERROR: File not found: {path} - {description}") + return False + + with open(path) as f: + content = f.read() + + if check_string in content: + print(f"SKIPPED (already patched): {path} - {description}") + return False + + if old_string not in content: + print(f"ERROR: Could not find target string in {path} for: {description}") + print(f" Looking for: {repr(old_string[:100])}") + return False + + content = content.replace(old_string, new_string, 1) + with open(path, "w") as f: + f.write(content) + print(f"PATCHED: {path} - {description}") + return True + + +def append_to_file(path, check_string, text, description): + """Append text to a file if check_string is not already present.""" + if not os.path.exists(path): + print(f"ERROR: File not found: {path} - {description}") + return False + + with open(path) as f: + content = f.read() + + if check_string in content: + print(f"SKIPPED (already patched): {path} - {description}") + return False + + content += text + with open(path, "w") as f: + f.write(content) + print(f"PATCHED: {path} - {description}") + return True + + +def find_vllm_neuron_dir(): + """Auto-detect vllm-neuron installation directory.""" + try: + import vllm_neuron + + return os.path.dirname(os.path.dirname(vllm_neuron.__file__)) + except ImportError: + # Try common locations + for path in [ + os.path.expanduser("~/vllm-neuron"), + "/home/ubuntu/vllm-neuron", + ]: + if os.path.exists( + os.path.join(path, "vllm_neuron", "worker", "constants.py") + ): + return path + return None + + +def find_nxdi_constants(): + """Auto-detect NxDI constants.py location.""" + try: + import neuronx_distributed_inference.utils.constants as c + + return c.__file__ + except ImportError: + pass + # Fallback: search common venv locations + for venv in [ 
+ "/opt/aws_neuronx_venv_pytorch_inference_vllm_0_13", + ]: + path = os.path.join(venv, "lib") + if os.path.exists(path): + for root, dirs, files in os.walk(path): + if ( + "constants.py" in files + and "neuronx_distributed_inference/utils" in root + ): + return os.path.join(root, "constants.py") + return None + + +def main(): + parser = argparse.ArgumentParser( + description="Patch vllm-neuron 0.5.0 for Qwen2.5-VL support" + ) + parser.add_argument("--vllm-dir", help="Path to vllm-neuron repo/install directory") + parser.add_argument("--nxdi-constants", help="Path to NxDI constants.py") + parser.add_argument( + "--qwen25vl-dir", + default="/home/ubuntu/qwen25vl", + help="Path to qwen2.5-vl model code on instance (default: /home/ubuntu/qwen25vl)", + ) + args = parser.parse_args() + + # Resolve paths + vllm_dir = args.vllm_dir or find_vllm_neuron_dir() + if vllm_dir is None: + print("ERROR: Could not find vllm-neuron installation. Use --vllm-dir.") + sys.exit(1) + + nxdi_constants = args.nxdi_constants or find_nxdi_constants() + if nxdi_constants is None: + print("ERROR: Could not find NxDI constants.py. Use --nxdi-constants.") + sys.exit(1) + + VLLM_CONSTANTS = os.path.join(vllm_dir, "vllm_neuron", "worker", "constants.py") + MODEL_LOADER = os.path.join( + vllm_dir, "vllm_neuron", "worker", "neuronx_distributed_model_loader.py" + ) + MODEL_RUNNER = os.path.join( + vllm_dir, "vllm_neuron", "worker", "neuronx_distributed_model_runner.py" + ) + NXDI_CONSTANTS = nxdi_constants + qwen25vl_dir = args.qwen25vl_dir + + print(f"vllm-neuron dir: {vllm_dir}") + print(f"NxDI constants: {NXDI_CONSTANTS}") + print(f"Qwen2.5-VL dir: {qwen25vl_dir}") + print() + + patched = 0 + skipped = 0 + errors = 0 + + # === 1. 
Add to NEURON_MULTI_MODAL_MODELS === + result = patch_file( + VLLM_CONSTANTS, + check_string="Qwen2_5_VLForConditionalGeneration", + old_string=' "Qwen3VLForConditionalGeneration",\n]', + new_string=' "Qwen2_5_VLForConditionalGeneration",\n "Qwen3VLForConditionalGeneration",\n]', + description="Add Qwen2_5_VL to NEURON_MULTI_MODAL_MODELS", + ) + patched += 1 if result else 0 + + # === 2. Register in NxDI MODEL_TYPES === + registration_block = f""" + +# --- Qwen2.5-VL registration (patched for qwen2.5-vl project) --- +import sys as _sys +if '{qwen25vl_dir}' not in _sys.path: + _sys.path.insert(0, '{qwen25vl_dir}') +try: + from src.modeling_qwen2_5_vl import NeuronQwen2_5_VLForCausalLM as _Qwen25VL_CausalLM + from src.modeling_qwen2_5_vl_vision import NeuronQwen2_5_VLForImageEncoding as _Qwen25VL_ImageEnc + MODEL_TYPES["qwen2_5_vl"] = {{ + "causal-lm": _Qwen25VL_CausalLM, + "image-encoding": _Qwen25VL_ImageEnc, + }} +except ImportError as _e: + import logging as _logging + _logging.getLogger("Neuron").warning(f"Qwen2.5-VL registration failed: {{_e}}") +# --- End Qwen2.5-VL registration --- +""" + result = append_to_file( + NXDI_CONSTANTS, + check_string="qwen2_5_vl", + text=registration_block, + description="Register qwen2_5_vl in MODEL_TYPES", + ) + patched += 1 if result else 0 + + # === 3a. Add NeuronQwen2_5VLForCausalLM wrapper class === + wrapper_class = ''' + +class NeuronQwen2_5VLForCausalLM(NeuronQwen2VLForCausalLM): + """vLLM wrapper for Qwen2.5-VL. 
Inherits execute_model/forward from Qwen2-VL wrapper.""" + def _save_pretrained_model(self, model_name: str): + from transformers import Qwen2_5_VLForConditionalGeneration + hf_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_name) + saved_path = os.path.join("local-models", model_name) + hf_model.save_pretrained(saved_path) + return saved_path + + +''' + + old_after_qwen3 = """class NeuronQwen3VLForCausalLM(NeuronQwen2VLForCausalLM): + def _save_pretrained_model(self, model_name: str): + from transformers import Qwen3VLForConditionalGeneration + + hf_model = Qwen3VLForConditionalGeneration.from_pretrained(model_name) + saved_path = os.path.join("local-models", model_name) + hf_model.save_pretrained(saved_path) + return saved_path""" + + new_after_qwen3 = old_after_qwen3 + wrapper_class + + result = patch_file( + MODEL_LOADER, + check_string="NeuronQwen2_5VLForCausalLM", + old_string=old_after_qwen3, + new_string=new_after_qwen3, + description="Add NeuronQwen2_5VLForCausalLM wrapper class", + ) + patched += 1 if result else 0 + + # === 3b. Add dispatch in get_neuron_model === + old_dispatch = """ elif architecture == "Qwen3VLForConditionalGeneration": + model = NeuronQwen3VLForCausalLM(model_config.hf_config) + else:""" + new_dispatch = """ elif architecture == "Qwen2_5_VLForConditionalGeneration": + model = NeuronQwen2_5VLForCausalLM(model_config.hf_config) + elif architecture == "Qwen3VLForConditionalGeneration": + model = NeuronQwen3VLForCausalLM(model_config.hf_config) + else:""" + + result = patch_file( + MODEL_LOADER, + check_string='architecture == "Qwen2_5_VLForConditionalGeneration"', + old_string=old_dispatch, + new_string=new_dispatch, + description="Add Qwen2.5-VL dispatch in get_neuron_model", + ) + patched += 1 if result else 0 + + # === 4. 
Add multimodal data routing === + old_mm_dispatch = """ elif self.model.model.config.model_type == "qwen3_vl": + # Qwen3-VL uses the same processing as Qwen2-VL + mm_kwargs = self._process_multi_modal_data_neuron_qwen2_vl(mm_kwargs)""" + new_mm_dispatch = """ elif self.model.model.config.model_type == "qwen2_5_vl": + # Qwen2.5-VL uses the same processing as Qwen2-VL + mm_kwargs = self._process_multi_modal_data_neuron_qwen2_vl(mm_kwargs) + elif self.model.model.config.model_type == "qwen3_vl": + # Qwen3-VL uses the same processing as Qwen2-VL + mm_kwargs = self._process_multi_modal_data_neuron_qwen2_vl(mm_kwargs)""" + + result = patch_file( + MODEL_RUNNER, + check_string="qwen2_5_vl", + old_string=old_mm_dispatch, + new_string=new_mm_dispatch, + description="Add qwen2_5_vl multimodal data routing", + ) + patched += 1 if result else 0 + + print(f"\nDone! {patched} patches applied.") + + +if __name__ == "__main__": + main() diff --git a/contrib/models/Qwen2.5-VL-7B-Instruct/patch_vllm_qwen25vl.py b/contrib/models/Qwen2.5-VL-7B-Instruct/patch_vllm_qwen25vl.py new file mode 100644 index 00000000..c22062a6 --- /dev/null +++ b/contrib/models/Qwen2.5-VL-7B-Instruct/patch_vllm_qwen25vl.py @@ -0,0 +1,170 @@ +""" +Patch vllm-neuron to add Qwen2.5-VL support. +Modifies 4 files: +1. vllm_neuron/worker/constants.py - Add to NEURON_MULTI_MODAL_MODELS +2. NxDI constants.py - Add to MODEL_TYPES +3. vllm_neuron/worker/neuronx_distributed_model_loader.py - Add wrapper class + dispatches +4. 
vllm_neuron/worker/neuronx_distributed_model_runner.py - Add multimodal data routing +""" + +import os +import sys + + +def patch_file(path, check_string, old_string, new_string, description): + """Patch a file by replacing old_string with new_string.""" + with open(path) as f: + content = f.read() + + if check_string in content: + print(f"SKIPPED (already patched): {path} - {description}") + return False + + if old_string not in content: + print(f"ERROR: Could not find target string in {path} for: {description}") + print(f" Looking for: {repr(old_string[:100])}") + return False + + content = content.replace(old_string, new_string, 1) + with open(path, "w") as f: + f.write(content) + print(f"PATCHED: {path} - {description}") + return True + + +def append_to_file(path, check_string, text, description): + """Append text to a file if check_string is not already present.""" + with open(path) as f: + content = f.read() + + if check_string in content: + print(f"SKIPPED (already patched): {path} - {description}") + return False + + content += text + with open(path, "w") as f: + f.write(content) + print(f"PATCHED: {path} - {description}") + return True + + +# === File paths === +VLLM_CONSTANTS = "/vllm/vllm_neuron/worker/constants.py" +NXDI_CONSTANTS = "/opt/aws_neuronx_venv_pytorch_inference_vllm_0_13/lib/python3.12/site-packages/neuronx_distributed_inference/utils/constants.py" +MODEL_LOADER = "/vllm/vllm_neuron/worker/neuronx_distributed_model_loader.py" +MODEL_RUNNER = "/vllm/vllm_neuron/worker/neuronx_distributed_model_runner.py" + + +# === 1. Add to NEURON_MULTI_MODAL_MODELS === +patch_file( + VLLM_CONSTANTS, + check_string="Qwen2_5_VLForConditionalGeneration", + old_string=' "Qwen3VLForConditionalGeneration",\n]', + new_string=' "Qwen2_5_VLForConditionalGeneration",\n "Qwen3VLForConditionalGeneration",\n]', + description="Add Qwen2_5_VL to NEURON_MULTI_MODAL_MODELS", +) + + +# === 2. 
Register in NxDI MODEL_TYPES === +registration_block = """ + +# --- Qwen2.5-VL registration (patched for qwen2.5-vl project) --- +import sys as _sys +if '/home/ubuntu/qwen25vl' not in _sys.path: + _sys.path.insert(0, '/home/ubuntu/qwen25vl') +try: + from src.modeling_qwen2_5_vl import NeuronQwen2_5_VLForCausalLM as _Qwen25VL_CausalLM + from src.modeling_qwen2_5_vl_vision import NeuronQwen2_5_VLForImageEncoding as _Qwen25VL_ImageEnc + MODEL_TYPES["qwen2_5_vl"] = { + "causal-lm": _Qwen25VL_CausalLM, + "image-encoding": _Qwen25VL_ImageEnc, + } +except ImportError as _e: + import logging as _logging + _logging.getLogger("Neuron").warning(f"Qwen2.5-VL registration failed: {_e}") +# --- End Qwen2.5-VL registration --- +""" +append_to_file( + NXDI_CONSTANTS, + check_string="qwen2_5_vl", + text=registration_block, + description="Register qwen2_5_vl in MODEL_TYPES", +) + + +# === 3a. Add NeuronQwen2_5VLForCausalLM wrapper class === +wrapper_class = """ + +class NeuronQwen2_5VLForCausalLM(NeuronQwen2VLForCausalLM): + \"\"\"vLLM wrapper for Qwen2.5-VL. 
Inherits execute_model/forward from Qwen2-VL wrapper.\"\"\"
+    def _save_pretrained_model(self, model_name: str):
+        from transformers import Qwen2_5_VLForConditionalGeneration
+        hf_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_name)
+        saved_path = os.path.join("local-models", model_name)
+        hf_model.save_pretrained(saved_path)
+        return saved_path
+
+
+"""
+
+# Find exact insertion point: after NeuronQwen3VLForCausalLM class ends
+old_after_qwen3 = """class NeuronQwen3VLForCausalLM(NeuronQwen2VLForCausalLM):
+    def _save_pretrained_model(self, model_name: str):
+        from transformers import Qwen3VLForConditionalGeneration
+
+        hf_model = Qwen3VLForConditionalGeneration.from_pretrained(model_name)
+        saved_path = os.path.join("local-models", model_name)
+        hf_model.save_pretrained(saved_path)
+        return saved_path"""
+
+new_after_qwen3 = old_after_qwen3 + wrapper_class
+
+patch_file(
+    MODEL_LOADER,
+    check_string="NeuronQwen2_5VLForCausalLM",
+    old_string=old_after_qwen3,
+    new_string=new_after_qwen3,
+    description="Add NeuronQwen2_5VLForCausalLM wrapper class",
+)
+
+
+# === 3b. Add dispatch in get_neuron_model ===
+old_dispatch = """    elif architecture == "Qwen3VLForConditionalGeneration":
+        model = NeuronQwen3VLForCausalLM(model_config.hf_config)
+    else:"""
+new_dispatch = """    elif architecture == "Qwen2_5_VLForConditionalGeneration":
+        model = NeuronQwen2_5VLForCausalLM(model_config.hf_config)
+    elif architecture == "Qwen3VLForConditionalGeneration":
+        model = NeuronQwen3VLForCausalLM(model_config.hf_config)
+    else:"""
+
+patch_file(
+    MODEL_LOADER,
+    check_string='architecture == "Qwen2_5_VLForConditionalGeneration"',
+    old_string=old_dispatch,
+    new_string=new_dispatch,
+    description="Add Qwen2.5-VL dispatch in get_neuron_model",
+)
+
+
+# === 4. 
Add multimodal data routing === +old_mm_dispatch = """ elif self.model.model.config.model_type == "qwen3_vl": + # Qwen3-VL uses the same processing as Qwen2-VL + mm_kwargs = self._process_multi_modal_data_neuron_qwen2_vl(mm_kwargs)""" +new_mm_dispatch = """ elif self.model.model.config.model_type == "qwen2_5_vl": + # Qwen2.5-VL uses the same processing as Qwen2-VL + mm_kwargs = self._process_multi_modal_data_neuron_qwen2_vl(mm_kwargs) + elif self.model.model.config.model_type == "qwen3_vl": + # Qwen3-VL uses the same processing as Qwen2-VL + mm_kwargs = self._process_multi_modal_data_neuron_qwen2_vl(mm_kwargs)""" + +patch_file( + MODEL_RUNNER, + check_string="qwen2_5_vl", + old_string=old_mm_dispatch, + new_string=new_mm_dispatch, + description="Add qwen2_5_vl multimodal data routing", +) + + +print("\nAll patches applied!") diff --git a/contrib/models/Qwen2.5-VL-7B-Instruct/src/__init__.py b/contrib/models/Qwen2.5-VL-7B-Instruct/src/__init__.py new file mode 100644 index 00000000..33c34294 --- /dev/null +++ b/contrib/models/Qwen2.5-VL-7B-Instruct/src/__init__.py @@ -0,0 +1,29 @@ +# Qwen2.5-VL-7B NxDI Full Vision-Language Implementation +# +# This contrib provides complete Qwen2.5-VL support on Neuron, including: +# - Text decoder (identical to Qwen2-VL backbone) +# - Vision encoder with windowed attention + SwiGLU MLP + RMSNorm +# - VL orchestrator with ImageToTextInferenceConfig +# +# Usage: +# from src.modeling_qwen2_5_vl import ( +# NeuronQwen2_5_VLForCausalLM, +# Qwen2_5_VLInferenceConfig, +# ) +# from src.modeling_qwen2_5_vl_vision import NeuronQwen2_5_VLForImageEncoding + +from .modeling_qwen2_5_vl import ( + NeuronQwen2_5_VLForCausalLM, + Qwen2_5_VLInferenceConfig, +) +from .modeling_qwen2_5_vl_text import ( + NeuronQwen2_5_VLTextForCausalLM, +) +from .modeling_qwen2_5_vl_vision import NeuronQwen2_5_VLForImageEncoding + +__all__ = [ + "NeuronQwen2_5_VLForCausalLM", + "Qwen2_5_VLInferenceConfig", + "NeuronQwen2_5_VLTextForCausalLM", + 
"NeuronQwen2_5_VLForImageEncoding", +] diff --git a/contrib/models/Qwen2.5-VL-7B-Instruct/src/modeling_qwen2_5_vl.py b/contrib/models/Qwen2.5-VL-7B-Instruct/src/modeling_qwen2_5_vl.py new file mode 100644 index 00000000..3a5e216d --- /dev/null +++ b/contrib/models/Qwen2.5-VL-7B-Instruct/src/modeling_qwen2_5_vl.py @@ -0,0 +1,377 @@ +# coding=utf-8 +# Copyright 2025 The Qwen Team. All rights reserved. +# Adapted for Qwen2.5-VL NxDI implementation. +# +# Top-level VL orchestrator: wires text decoder + vision encoder through +# NeuronBaseForImageToText and ImageToTextInferenceConfig. +# Pattern follows qwen2_vl/modeling_qwen2_vl.py. + +import copy +import logging +from typing import Callable, Dict, List, Optional, Tuple, Type, Union + +import torch +from transformers.modeling_outputs import CausalLMOutputWithPast + +from neuronx_distributed_inference.models.config import NeuronConfig +from neuronx_distributed_inference.models.image_to_text_model_base import ( + ImageToTextInferenceConfig, + NeuronBaseForImageToText, +) +from neuronx_distributed_inference.models.model_wrapper import VISION_ENCODER_MODEL_TAG +from neuronx_distributed_inference.models.llama4.utils.encoder_utils import ( + generate_positions_from_mask, + pad_positions, +) + +from src.modeling_qwen2_5_vl_text import ( + NeuronQwen2_5_VLTextModel, + NeuronQwen2_5_VLTextForCausalLM, + Qwen2_5_VLTextModelWrapper, +) +from src.modeling_qwen2_5_vl_vision import ( + NeuronQwen2_5_VLVisionModel, + NeuronQwen2_5_VLForImageEncoding, + Qwen2_5_VLVisionModelWrapper, +) + +logger = logging.getLogger("Neuron") + +# Keys to propagate from top-level config to text_config +QWEN2_5_VL_TEXT_CONFIG_KEYS = [ + "hidden_size", + "num_attention_heads", + "num_hidden_layers", + "num_key_value_heads", + "pad_token_id", + "vocab_size", + "intermediate_size", + "max_position_embeddings", + "rms_norm_eps", + "rope_theta", + "rope_scaling", + "hidden_act", + "bos_token_id", + "eos_token_id", + "qkv_bias", + "o_bias", + 
"vision_token_id", + "image_token_id", + "video_token_id", + "vision_start_token_id", + "vision_end_token_id", +] + +# Default pixels per image for bucket validation +DEFAULT_PIXELS_PER_IMAGE = 1024 + + +class Qwen2_5_VLInferenceConfig(ImageToTextInferenceConfig): + """Configuration for Qwen2.5-VL multimodal model.""" + + def __init__( + self, + text_neuron_config, + vision_neuron_config, + fused_spec_config=None, + load_config=None, + metadata: Optional[Dict] = None, + **kwargs, + ): + super().__init__( + text_neuron_config=text_neuron_config, + vision_neuron_config=vision_neuron_config, + fused_spec_config=fused_spec_config, + load_config=load_config, + metadata=metadata, + **kwargs, + ) + self.add_special_config() + self.validate_model_supported_configs() + + def add_special_config(self): + """Set Qwen2.5-VL-specific config values.""" + self.num_cores_per_group = 1 + self.qkv_bias = True + self.o_bias = False + + # Vision config: compute head_dim from hidden_size and num_heads + if hasattr(self, "vision_config") and hasattr( + self.vision_config, "hidden_size" + ): + self.vision_config.head_dim = ( + self.vision_config.hidden_size // self.vision_config.num_heads + ) + self.vision_config.num_cores_per_group = 1 + + # Vision config: set default image dimensions and bucket + # vllm's HF processor may resize images larger than 640x640, producing + # more patches than bucket=[1] with 640x640 pixels_per_image can hold. + # Use bucket=[2] for headroom (supports images up to ~2x pixels_per_image patches). 
+ if hasattr(self, "vision_config") and hasattr( + self.vision_config, "neuron_config" + ): + vnc = self.vision_config.neuron_config + if ( + not hasattr(vnc, "default_image_width") + or vnc.default_image_width is None + ): + vnc.default_image_width = 672 + if ( + not hasattr(vnc, "default_image_height") + or vnc.default_image_height is None + ): + vnc.default_image_height = 672 + # Default vision buckets to [2] for vllm compatibility + if not hasattr(vnc, "buckets") or vnc.buckets is None or vnc.buckets == []: + vnc.buckets = [2] + logger.info("Qwen2.5-VL vision: set default buckets=[2]") + + # Copy text config keys + if hasattr(self, "text_config"): + for key in QWEN2_5_VL_TEXT_CONFIG_KEYS: + if hasattr(self, key): + setattr(self.text_config, key, getattr(self, key)) + self.pad_token_id = getattr(self.text_config, "pad_token_id", None) + + def validate_model_supported_configs(self): + # Ensure text_config matches top-level config (set if missing) + for key in QWEN2_5_VL_TEXT_CONFIG_KEYS: + if hasattr(self, key) and hasattr(self.text_config, key): + top_val = getattr(self, key) + text_val = getattr(self.text_config, key) + if top_val != text_val: + logger.warning( + f"Config mismatch: {key}: top={top_val} vs text={text_val}. " + f"Setting text_config.{key} = {top_val}" + ) + setattr(self.text_config, key, top_val) + + # Disable unsupported text features + for unsupported in [ + "is_block_kv_layout", + "is_prefix_caching", + "is_chunked_prefill", + "is_medusa", + "enable_fused_speculation", + ]: + if getattr(self.text_config.neuron_config, unsupported, False) is not False: + setattr(self.text_config.neuron_config, unsupported, False) + logger.warning( + f"Qwen2.5-VL text model does not support '{unsupported}'. Disabled." 
+ ) + + # Disable unsupported vision features + for unsupported in [ + "sequence_parallel_enabled", + "flash_decoding_enabled", + "qkv_kernel_enabled", # Fused RMSNorm+QKV fails: eps type mismatch with vision RMSNorm + "attn_block_tkg_nki_kernel_cache_update", + "attn_block_tkg_nki_kernel_enabled", + ]: + if ( + getattr(self.vision_config.neuron_config, unsupported, False) + is not False + ): + setattr(self.vision_config.neuron_config, unsupported, False) + logger.warning( + f"Qwen2.5-VL vision: '{unsupported}' unsupported, disabled." + ) + + # Vision encoder requires fused_qkv -- enforce it + if not getattr(self.vision_config.neuron_config, "fused_qkv", False): + self.vision_config.neuron_config.fused_qkv = True + logger.warning("Qwen2.5-VL vision: fused_qkv was not set, forcing to True.") + + # Text model also requires fused_qkv + if not getattr(self.text_config.neuron_config, "fused_qkv", False): + self.text_config.neuron_config.fused_qkv = True + logger.warning("Qwen2.5-VL text: fused_qkv was not set, forcing to True.") + + def get_required_attributes(self) -> List[str]: + return [ + "text_config", + "vision_config", + "text_config.hidden_size", + "text_config.num_attention_heads", + "text_config.num_hidden_layers", + "text_config.num_key_value_heads", + "text_config.pad_token_id", + "text_config.vocab_size", + "text_config.max_position_embeddings", + "text_config.rope_theta", + "text_config.rms_norm_eps", + "text_config.hidden_act", + "vision_config.depth", + "vision_config.hidden_size", + "vision_config.num_heads", + "vision_config.in_chans", + "vision_config.patch_size", + "vision_config.spatial_merge_size", + "vision_config.temporal_patch_size", + "vision_config.intermediate_size", + "vision_config.out_hidden_size", + "vision_config.window_size", + "vision_config.fullatt_block_indexes", + ] + + @classmethod + def get_neuron_config_cls(cls) -> Type[NeuronConfig]: + return NeuronConfig + + +class NeuronQwen2_5_VLForCausalLM(NeuronBaseForImageToText): + 
"""Top-level Qwen2.5-VL model for NxDI inference.""" + + text_model_cls = NeuronQwen2_5_VLTextModel + vision_model_cls = NeuronQwen2_5_VLVisionModel + text_model_wrapper = Qwen2_5_VLTextModelWrapper + vision_model_wrapper = Qwen2_5_VLVisionModelWrapper + + def __init__(self, *args, **kwargs): + super().__init__( + self.text_model_cls, + self.vision_model_cls, + self.text_model_wrapper, + self.vision_model_wrapper, + *args, + **kwargs, + ) + + def get_vision_compiler_args(self) -> str: + cc_factor = self.vision_config.neuron_config.cc_pipeline_tiling_factor + return ( + f"--auto-cast=none --model-type=transformer " + f"--tensorizer-options='--enable-ccop-compute-overlap " + f"--cc-pipeline-tiling-factor={cc_factor}' -O1 " + f"--hbm-scratchpad-page-size=1024 " + f"--internal-hlo2tensorizer-options='--verify-hlo=true'" + ) + + def get_compiler_args(self) -> str: + cc_factor = self.text_config.neuron_config.cc_pipeline_tiling_factor + return ( + f"--auto-cast=none --model-type=transformer " + f"--tensorizer-options='--enable-ccop-compute-overlap " + f"--cc-pipeline-tiling-factor={cc_factor}' -O1 " + f"--hbm-scratchpad-page-size=1024 " + f"--internal-hlo2tensorizer-options='--verify-hlo=true'" + ) + + def get_required_kwargs(self) -> List[str]: + return ["pixel_values", "vision_mask", "image_grid_thw"] + + def enable_vision_encoder(self, enable_wlt_optimization=True, **model_init_kwargs): + new_config = copy.deepcopy(self.config) + self.vision_encoder_model = self.vision_model_wrapper( + config=new_config, + model_cls=self.vision_model_cls, + tag=VISION_ENCODER_MODEL_TAG, + compiler_args=self.get_vision_compiler_args(), + model_init_kwargs=model_init_kwargs, + priority_model_idx=(0 if enable_wlt_optimization else None), + pipeline_execution=True, + return_ranked_to_cpu=False, + ) + self.vision_models.append(self.vision_encoder_model) + + @staticmethod + def load_hf_model(model_path, **kwargs): + from transformers import Qwen2_5_VLForConditionalGeneration + + return 
Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path, **kwargs) + + @staticmethod + def update_state_dict_for_tied_weights(state_dict): + """Handle tied weights (embed_tokens == lm_head) for models like 3B. + Delegates to text model class which clones embed_tokens -> lm_head.""" + NeuronQwen2_5_VLTextForCausalLM.update_state_dict_for_tied_weights(state_dict) + + @staticmethod + def convert_hf_to_neuron_state_dict( + state_dict: dict, inference_config: "Qwen2_5_VLInferenceConfig" + ) -> dict: + """Convert full HF state dict: split into vision + text conversion.""" + state_dict = NeuronQwen2_5_VLForImageEncoding.convert_hf_to_neuron_state_dict( + state_dict, inference_config + ) + state_dict = NeuronQwen2_5_VLTextForCausalLM.convert_hf_to_neuron_state_dict( + state_dict, inference_config.text_config + ) + return state_dict + + def get_padding_length(self, input_ids): + buckets = self.context_encoding_model.config.neuron_config.buckets + for val in buckets: + if val >= input_ids.shape[1]: + return val + raise Exception("No bucket found for provided input_ids!") + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + seq_ids: Optional[torch.LongTensor] = None, + sampling_params: Optional[torch.FloatTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + vision_mask: Optional[torch.FloatTensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + adapter_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + use_cache: Optional[bool] = None, + medusa_args=None, + input_capture_hook: Optional[Callable] = None, + tensor_capture_hook: Optional[Callable] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + pad_limit = self.get_padding_length(input_ids) + + if ( + pixel_values is not None + and input_ids.shape[-1] > 1 + and pixel_values.sum() != 0 + 
): + # Compute vision mask from image_token_id positions + vision_mask = (input_ids == self.config.image_token_id).unsqueeze(-1) + vision_mask = vision_mask.to(torch.bool) + vision_mask = generate_positions_from_mask(vision_mask.squeeze()) + vision_mask = pad_positions(vision_mask, pad_limit, (pad_limit - 1)) + + # Run vision encoder + vision_embeddings = self.vision_encoder_model( + pixel_values.to(self.vision_config.neuron_config.torch_dtype), + image_grid_thw, + ) + else: + # Text-only or decode step -- use dummy vision inputs + vision_embeddings, vision_mask = ( + self.text_model_wrapper.get_dummy_vision_inputs( + config=self.text_config, + input_ids=input_ids, + n_active_tokens=pad_limit, + fill_value=(pad_limit - 1), + ) + ) + + return super().forward( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + seq_ids=seq_ids, + sampling_params=sampling_params, + input_capture_hook=input_capture_hook, + tensor_capture_hook=tensor_capture_hook, + vision_embeddings=vision_embeddings, + vision_mask=vision_mask, + ) + + @classmethod + def get_config_cls(cls): + return Qwen2_5_VLInferenceConfig + + @classmethod + def prepare_input_args(cls, prompts, images, processor, role="user", config=None): + return NeuronQwen2_5_VLForImageEncoding.prepare_input_args( + prompts, images, processor, role, config + ) diff --git a/contrib/models/Qwen2.5-VL-7B-Instruct/src/modeling_qwen2_5_vl_text.py b/contrib/models/Qwen2.5-VL-7B-Instruct/src/modeling_qwen2_5_vl_text.py new file mode 100644 index 00000000..0b29c3c2 --- /dev/null +++ b/contrib/models/Qwen2.5-VL-7B-Instruct/src/modeling_qwen2_5_vl_text.py @@ -0,0 +1,371 @@ +# coding=utf-8 +# Copyright 2025 The Qwen Team. All rights reserved. +# Adapted for Qwen2.5-VL NxDI implementation. +# +# Text backbone is identical to Qwen2-VL: same GQA, QKV bias, M-RoPE, SwiGLU MLP. +# This file is adapted from NxDI qwen2_vl/modeling_qwen2_vl_text.py. 

import gc
import logging
from typing import Optional, Tuple

import torch
from torch import nn

from neuronx_distributed.parallel_layers import parallel_state
from neuronx_distributed.parallel_layers.layers import (
    ColumnParallelLinear,
    ParallelEmbedding,
)
from neuronx_distributed.utils import cpu_mode
from neuronx_distributed_inference.modules.custom_calls import CustomRMSNorm
from neuronx_distributed_inference.models.config import InferenceConfig
from neuronx_distributed_inference.models.image_to_text_model_wrapper import (
    ImageToTextModelWrapper,
)
from neuronx_distributed_inference.models.llama.modeling_llama import NeuronLlamaMLP
from neuronx_distributed_inference.models.llama4.utils.encoder_utils import (
    scatter_by_index_put,
)
from neuronx_distributed_inference.models.model_base import (
    NeuronBaseForCausalLM,
    NeuronBaseModel,
)
from neuronx_distributed_inference.modules.attention.attention_base import (
    NeuronAttentionBase,
)
from neuronx_distributed_inference.modules.attention.utils import _rotate_half
from transformers.models.llama.modeling_llama import LlamaRMSNorm

logger = logging.getLogger("Neuron")


def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim=1):
    """Apply M-RoPE (multimodal rotary position embedding) to query/key tensors.

    The head dimension is partitioned into [temporal, height, width] sections
    (mrope_section, doubled because the cos/sin tables store rotated pairs);
    each section draws its rotation from the matching plane (dim 0) of the 3-D
    rope tables. Identical to the Qwen2-VL formulation.
    """
    sections = mrope_section * 2
    # Running-sum split points between consecutive sections.
    split_points = []
    running = 0
    for width in sections[:-1]:
        running += width
        split_points.append(running)

    def _select_planes(table):
        # table stacks the 3 rope planes on dim 0; section i uses plane i % 3.
        pieces = torch.tensor_split(table, split_points, dim=-1)
        chosen = [piece[i % 3] for i, piece in enumerate(pieces)]
        return torch.cat(chosen, dim=-1).unsqueeze(unsqueeze_dim)

    cos = _select_planes(cos)
    sin = _select_planes(sin)

    q_embed = (q * cos) + (_rotate_half(q) * sin)
    k_embed = (k * cos) + (_rotate_half(k) * sin)
    return q_embed, k_embed


def get_rmsnorm_cls():
    """CustomRMSNorm on device; plain LlamaRMSNorm when running in CPU mode."""
    return LlamaRMSNorm if cpu_mode() else CustomRMSNorm


class NeuronQwen2_5_VLRotaryEmbedding(nn.Module):
    """3-D rotary embedding producing the stacked [temporal, height, width]
    cos/sin planes consumed by apply_multimodal_rotary_pos_emb.
    Identical to the Qwen2-VL rotary embedding."""

    def __init__(self, config: InferenceConfig, device=None):
        super().__init__()
        self.dim = config.hidden_size // config.num_attention_heads
        self.base = getattr(config, "rope_theta", 1000000.0)
        self.attention_scaling = 1.0
        self.register_buffer("inv_freq", None, persistent=False)
        self.inv_freq = self.get_inv_freqs(device)

    def get_inv_freqs(self, device=None):
        exponents = torch.arange(0, self.dim, 2, dtype=torch.float32, device=device)
        return 1.0 / (self.base ** (exponents / self.dim))

    def forward(self, x, position_ids):
        # NOTE(review): the expand below uses position_ids.shape[0] as the batch
        # axis, which matches 2-D (batch, seq) position_ids; the upstream HF code
        # uses shape[1] with 3-D (3, batch, seq) ids -- confirm the caller's layout.
        freq_table = self.inv_freq[None, None, :, None].expand(
            3, position_ids.shape[0], -1, 1
        )
        pos = position_ids[None, :, None, :].float()

        device_type = (
            x.device.type
            if isinstance(x.device.type, str) and x.device.type != "mps"
            else "cpu"
        )
        # Rope math is always done in fp32 regardless of autocast policy.
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (freq_table.float() @ pos.float()).transpose(2, 3)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+ Identical to Qwen2-VL text attention.""" + + def __init__(self, config: InferenceConfig, tensor_model_parallel_group=None): + super().__init__( + config=config, + tensor_model_parallel_group=tensor_model_parallel_group, + hidden_size=config.hidden_size, + num_attention_heads=config.num_attention_heads, + num_key_value_heads=config.num_key_value_heads, + head_dim=getattr( + config, "head_dim", config.hidden_size // config.num_attention_heads + ), + num_cores_per_group=config.num_cores_per_group, + qkv_bias=True, + o_bias=False, + rotary_emb=NeuronQwen2_5_VLRotaryEmbedding(config), + rms_norm_eps=config.rms_norm_eps, + attention_chunk_size=getattr(config, "attention_chunk_size", None), + sliding_window=getattr(config, "sliding_window", None), + ) + self.rope_theta = config.rope_theta + self.rope_scaling = config.rope_scaling + self.mrope_section = config.rope_scaling["mrope_section"] + self.padding_side = "right" + + def apply_rotary_embedding( + self, Q, K, V, position_ids, cos_cache, sin_cache, use_polar_compatible_rope + ): + if self.rotary_emb is not None: + if cos_cache is None or sin_cache is None: + cos_cache, sin_cache = self.rotary_emb(V, position_ids) + Q, K = apply_multimodal_rotary_pos_emb( + Q, K, cos_cache, sin_cache, self.mrope_section + ) + return Q, K, cos_cache, sin_cache + + +class NeuronQwen2_5_VLDecoderLayer(nn.Module): + """Pre-norm transformer decoder layer with GQA + SwiGLU MLP.""" + + def __init__(self, config: InferenceConfig): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = NeuronQwen2_5_VLAttention(config) + self.mlp = NeuronLlamaMLP(config) + self.input_layernorm = get_rmsnorm_cls()( + config.hidden_size, eps=config.rms_norm_eps + ) + self.post_attention_layernorm = get_rmsnorm_cls()( + config.hidden_size, eps=config.rms_norm_eps + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: 
Optional[Tuple[torch.Tensor]] = None, + **kwargs, + ) -> Tuple[torch.FloatTensor, ...]: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + hidden_states, present_key_value, cos_cache, sin_cache = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + **kwargs, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states)[0] + hidden_states = residual + hidden_states + + return (hidden_states, present_key_value, cos_cache, sin_cache, None) + + +class Qwen2_5_VLTextModelWrapper(ImageToTextModelWrapper): + """Wrapper for text model that provides dummy vision inputs.""" + + def __init__( + self, + config, + model_cls, + tag="", + compiler_args=None, + priority_model_idx=None, + pipeline_execution=True, + return_ranked_to_cpu=True, + model_init_kwargs={}, + ): + super().__init__( + config, + model_cls, + tag, + compiler_args, + priority_model_idx, + pipeline_execution, + return_ranked_to_cpu, + model_init_kwargs, + ) + + @staticmethod + def get_dummy_vision_inputs(config, input_ids, n_active_tokens, fill_value): + input_batch_size, input_sequence_len = input_ids.shape[0], input_ids.shape[-1] + if input_sequence_len > 1: + vision_embeddings = torch.zeros( + input_batch_size, + config.neuron_config.seq_len, + config.hidden_size, + dtype=config.neuron_config.torch_dtype, + ) + vision_mask = torch.full( + size=(input_batch_size, n_active_tokens, 1), + fill_value=fill_value, + dtype=torch.int32, + ) + else: + vision_embeddings = torch.zeros((0), dtype=config.neuron_config.torch_dtype) + vision_mask = torch.zeros((0), dtype=torch.bool) + return vision_embeddings, vision_mask + + +class NeuronQwen2_5_VLTextModel(NeuronBaseModel): + """Qwen2.5-VL text decoder model. 
+ Identical architecture to Qwen2-VL text model.""" + + def encode_vision_to_input(self, inputs_embeds, vision_embeddings, vision_mask): + return scatter_by_index_put(inputs_embeds, vision_embeddings, vision_mask) + + def setup_attr_for_model(self, config: InferenceConfig): + self.on_device_sampling = ( + config.neuron_config.on_device_sampling_config is not None + ) + self.tp_degree = config.neuron_config.tp_degree + self.hidden_size = config.hidden_size + self.num_attention_heads = config.num_attention_heads + self.num_key_value_heads = config.num_key_value_heads + self.max_batch_size = config.neuron_config.max_batch_size + self.buckets = config.neuron_config.buckets + + def init_model(self, config: InferenceConfig): + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + if parallel_state.model_parallel_is_initialized(): + self.embed_tokens = ParallelEmbedding( + config.vocab_size, + config.hidden_size, + config.pad_token_id, + dtype=config.neuron_config.torch_dtype, + shard_across_embedding=True, + pad=True, + ) + self.lm_head = ColumnParallelLinear( + config.hidden_size, + config.vocab_size, + bias=False, + pad=True, + ) + else: + self.embed_tokens = nn.Embedding( + self.vocab_size, self.hidden_size, self.padding_idx + ) + self.lm_head = nn.Linear(self.hidden_size, self.vocab_size, bias=False) + + self.layers = nn.ModuleList( + [ + NeuronQwen2_5_VLDecoderLayer(config) + for _ in range(config.num_hidden_layers) + ] + ) + self.norm = get_rmsnorm_cls()(config.hidden_size, eps=config.rms_norm_eps) + + +class NeuronQwen2_5_VLTextForCausalLM(NeuronBaseForCausalLM): + """CausalLM wrapper for compilation and state dict conversion.""" + + _model_cls = NeuronQwen2_5_VLTextModel + + @staticmethod + def load_hf_model(model_path): + from transformers import Qwen2_5_VLForConditionalGeneration + + return Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path) + + @staticmethod + def convert_hf_to_neuron_state_dict( + state_dict: dict, 
inference_config: InferenceConfig + ) -> dict: + """Convert HF state dict for text model. + Qwen2.5-VL uses 'model.' prefix (same as Qwen2-VL, not 'language_model.' like Qwen3-VL).""" + attention_keys = { + ".self_attn.q_proj.": ".self_attn.qkv_proj.q_proj.", + ".self_attn.k_proj.": ".self_attn.qkv_proj.k_proj.", + ".self_attn.v_proj.": ".self_attn.qkv_proj.v_proj.", + ".self_attn.o_proj.": ".self_attn.o_proj.o_proj.", + } + new_state_dict = {} + for dict_key in state_dict: + if "model." in dict_key: + new_key = dict_key.replace("model.", "") + if not inference_config.neuron_config.fused_qkv: + for atten_key in attention_keys: + if atten_key in new_key: + new_key = new_key.replace( + atten_key, attention_keys[atten_key] + ) + new_state_dict[new_key] = state_dict[dict_key] + else: + new_state_dict[dict_key] = state_dict[dict_key] + + if inference_config.neuron_config.fused_qkv: + new_state_dict = _convert_state_dict_to_fused_qkv( + new_state_dict, inference_config + ) + + return new_state_dict + + @staticmethod + def update_state_dict_for_tied_weights(state_dict): + state_dict["lm_head.weight"] = state_dict["embed_tokens.weight"].clone() + + @classmethod + def get_config_cls(cls): + return InferenceConfig + + +# --- Fused QKV helpers --- + + +def _helper_concat_and_delete_qkv(state_dict, layer_num, attr): + state_dict[f"layers.{layer_num}.self_attn.Wqkv.{attr}"] = torch.cat( + [ + state_dict[f"layers.{layer_num}.self_attn.q_proj.{attr}"], + state_dict[f"layers.{layer_num}.self_attn.k_proj.{attr}"], + state_dict[f"layers.{layer_num}.self_attn.v_proj.{attr}"], + ] + ) + del state_dict[f"layers.{layer_num}.self_attn.q_proj.{attr}"] + del state_dict[f"layers.{layer_num}.self_attn.k_proj.{attr}"] + del state_dict[f"layers.{layer_num}.self_attn.v_proj.{attr}"] + + +def _convert_state_dict_to_fused_qkv(state_dict, cfg: InferenceConfig): + mods_to_not_conv = getattr(cfg.neuron_config, "modules_to_not_convert", None) or [] + for layer in range(cfg.num_hidden_layers): + 
_helper_concat_and_delete_qkv(state_dict, layer, "weight") + _helper_concat_and_delete_qkv(state_dict, layer, "bias") + if ( + cfg.neuron_config.quantized_mlp_kernel_enabled + or cfg.neuron_config.quantized + ) and f"layers.{layer}.self_attn" not in mods_to_not_conv: + _helper_concat_and_delete_qkv(state_dict, layer, "scale") + gc.collect() + return state_dict diff --git a/contrib/models/Qwen2.5-VL-7B-Instruct/src/modeling_qwen2_5_vl_vision.py b/contrib/models/Qwen2.5-VL-7B-Instruct/src/modeling_qwen2_5_vl_vision.py new file mode 100644 index 00000000..e99dcba5 --- /dev/null +++ b/contrib/models/Qwen2.5-VL-7B-Instruct/src/modeling_qwen2_5_vl_vision.py @@ -0,0 +1,759 @@ +# coding=utf-8 +# Copyright 2025 The Qwen Team. All rights reserved. +# Adapted for Qwen2.5-VL NxDI implementation. +# +# Vision encoder for Qwen2.5-VL: +# - RMSNorm (not LayerNorm like Qwen2-VL) +# - Gated SwiGLU MLP with bias=True (unique to Qwen2.5-VL) +# - Hybrid windowed/global attention (window_size=112, fullatt_block_indexes=[7,15,23,31]) +# - 2D spatial rotary position embeddings (like Qwen2-VL, not learned like Qwen3-VL) +# - Patch merger with RMSNorm + GELU MLP +# +# Architecture pattern follows qwen3_vl: patch embed + pos embed on CPU in wrapper, +# compiled model only handles ViT blocks + merger. Bucketed on vision sequence length. 

import logging
import math
from typing import List, Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers.activations import ACT2FN

from neuronx_distributed.parallel_layers.layers import (
    ColumnParallelLinear,
    RowParallelLinear,
)
from neuronx_distributed_inference.models.application_base import NeuronApplicationBase
from neuronx_distributed_inference.models.config import InferenceConfig
from neuronx_distributed_inference.models.model_wrapper import (
    EncoderModelInstance,
    ModelWrapper,
)
from neuronx_distributed_inference.modules.padding import (
    pad_tensor,
    pad_with_first_batchline,
)
from neuronx_distributed_inference.modules.attention.attention_base import (
    NeuronAttentionBase,
)
from neuronx_distributed_inference.modules.attention.utils import apply_rotary_pos_emb

logger = logging.getLogger("Neuron")


# =============================================================================
# Vision encoder components
# =============================================================================


class Qwen2_5_VLVisionRMSNorm(nn.Module):
    """RMS normalization for the vision tower (eps=1e-6).

    Computed in fp32 for numerical stability, returned in the input dtype.
    """

    def __init__(self, hidden_size: int, eps: float = 1e-6, dtype=torch.bfloat16):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size, dtype=dtype))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        orig_dtype = x.dtype
        x32 = x.float()
        normed = x32 * torch.rsqrt(x32.pow(2).mean(-1, keepdim=True) + self.eps)
        return (self.weight * normed).to(orig_dtype)


class Qwen2_5_VLVisionRotaryEmbedding(nn.Module):
    """Pass-through rotary module: the wrapper supplies precomputed (cos, sin);
    this only casts them to the activation dtype."""

    @torch.inference_mode()
    def forward(self, x, position_embeddings):
        cos, sin = position_embeddings
        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


class VisionRotaryEmbedding(nn.Module):
    """Frequency table for the 2-D spatial rotary embedding.

    forward(seqlen) returns a (seqlen, dim/2) table of position * inv_freq
    products that is later indexed by per-patch position IDs.
    """

    inv_freq: torch.Tensor

    def __init__(self, dim: int, theta: float = 10000.0) -> None:
        super().__init__()
        exponents = torch.arange(0, dim, 2, dtype=torch.float) / dim
        self.register_buffer("inv_freq", 1.0 / theta**exponents, persistent=False)

    def forward(self, seqlen: int) -> torch.Tensor:
        positions = torch.arange(
            seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype
        )
        return torch.outer(positions, self.inv_freq)


class Qwen2_5_VLVisionSwiGLUMLP(nn.Module):
    """Gated SwiGLU MLP with bias=True -- unique to the Qwen2.5-VL vision tower.

    Computes down(silu(gate(x)) * up(x)); gate/up are column-parallel and down
    is row-parallel so the intermediate activations stay sharded across TP ranks.
    """

    def __init__(self, hidden_size: int, intermediate_size: int, dtype=torch.bfloat16):
        super().__init__()
        self.gate_proj = ColumnParallelLinear(
            hidden_size, intermediate_size, bias=True, gather_output=False, dtype=dtype
        )
        self.up_proj = ColumnParallelLinear(
            hidden_size, intermediate_size, bias=True, gather_output=False, dtype=dtype
        )
        self.down_proj = RowParallelLinear(
            intermediate_size,
            hidden_size,
            bias=True,
            input_is_parallel=True,
            dtype=dtype,
            reduce_dtype=dtype,
        )
        self.act_fn = nn.SiLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        gated = self.act_fn(self.gate_proj(x)) * self.up_proj(x)
        return self.down_proj(gated)
+ All 16 heads are attention heads (no KV head grouping).""" + + def __init__(self, config): + super().__init__( + config=config, + hidden_size=config.hidden_size, + num_attention_heads=config.num_heads, + num_key_value_heads=config.num_heads, # MHA -- all heads are KV heads + head_dim=config.head_dim, + num_cores_per_group=config.num_cores_per_group, + sequence_parallel_enabled=False, + rotary_emb=Qwen2_5_VLVisionRotaryEmbedding(), + qkv_bias=True, + o_bias=True, + ) + + def forward(self, hidden_states, position_embeddings=None, **kwargs): + self._position_embeddings = position_embeddings + try: + return super().forward(hidden_states, **kwargs) + finally: + self._position_embeddings = None + + def apply_rotary_embedding( + self, Q, K, V, position_ids, cos_cache, sin_cache, use_polar_compatible_rope + ): + if self.rotary_emb is not None: + if cos_cache is None or sin_cache is None: + cos_cache, sin_cache = self.rotary_emb(V, self._position_embeddings) + Q, K = apply_rotary_pos_emb(Q, K, cos_cache, sin_cache) + return Q, K, cos_cache, sin_cache + + +class Qwen2_5_VLVisionBlock(nn.Module): + """Pre-norm vision transformer block: RMSNorm + Attention + RMSNorm + SwiGLU MLP.""" + + def __init__(self, vision_config): + super().__init__() + dtype = vision_config.neuron_config.torch_dtype + self.norm1 = Qwen2_5_VLVisionRMSNorm( + vision_config.hidden_size, eps=1e-6, dtype=dtype + ) + self.norm2 = Qwen2_5_VLVisionRMSNorm( + vision_config.hidden_size, eps=1e-6, dtype=dtype + ) + self.attn = Qwen2_5_VLVisionAttention(vision_config) + self.mlp = Qwen2_5_VLVisionSwiGLUMLP( + hidden_size=vision_config.hidden_size, + intermediate_size=vision_config.intermediate_size, + dtype=dtype, + ) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + ) -> torch.Tensor: + attn_output = self.attn( + self.norm1(hidden_states), position_embeddings=position_embeddings + )[0] + hidden_states = hidden_states + 


class Qwen2_5_VLPatchMerger(nn.Module):
    """Merge each spatial_merge_size x spatial_merge_size group of patch tokens
    into one token and project it to the text hidden size.

    Uses RMSNorm (unlike Qwen2-VL, which uses LayerNorm) followed by a
    two-layer GELU MLP sharded column/row-parallel across TP ranks.

    Args:
        dim: Output (text) hidden size.
        context_dim: Vision hidden size per patch token.
        spatial_merge_size: Spatial merge factor (2 merges 2x2 patches).
        dtype: Parameter dtype.
    """

    def __init__(
        self,
        dim: int,
        context_dim: int,
        spatial_merge_size: int = 2,
        dtype=torch.bfloat16,
    ):
        super().__init__()
        # Each merged token concatenates spatial_merge_size**2 patch embeddings.
        self.hidden_size = context_dim * (spatial_merge_size**2)
        self.ln_q = Qwen2_5_VLVisionRMSNorm(context_dim, eps=1e-6, dtype=dtype)
        self.mlp = nn.Sequential(
            ColumnParallelLinear(
                self.hidden_size, self.hidden_size, gather_output=False, dtype=dtype
            ),
            nn.GELU(),
            RowParallelLinear(
                self.hidden_size,
                dim,
                input_is_parallel=True,
                dtype=dtype,
                reduce_dtype=dtype,
            ),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Normalize per patch token, then fold each merge group into the
        # feature dimension before the MLP projection.
        return self.mlp(self.ln_q(x).view(-1, self.hidden_size))


# =============================================================================
# Windowed attention utilities (run on CPU in wrapper)
# =============================================================================


def get_window_index(grid_thw, window_size, spatial_merge_size, patch_size):
    """Compute window partition indices for windowed attention.

    For each image, partitions the merged-patch grid into spatial windows.
    Returns permutation indices that group tokens by window, and cumulative
    sequence lengths per window.

    Args:
        grid_thw: Tensor of shape (num_images, 3) with [T, H, W] per image
        window_size: Pixel-space window size (e.g., 112)
        spatial_merge_size: Spatial merge factor (e.g., 2)
        patch_size: Patch size in pixels (e.g., 14)

    Returns:
        window_index: 1D tensor -- permutation to reorder tokens by window
        cu_window_seqlens: 1D tensor -- cumulative sequence lengths for windows
    """
    # Window edge measured in merged-patch units; 112 // 2 // 14 == 4.
    vit_merger_window_size = window_size // spatial_merge_size // patch_size

    all_window_indices = []
    all_window_seqlens = []

    spatial_merge_unit = spatial_merge_size * spatial_merge_size  # 4
    offset = 0

    for t, h, w in grid_thw.tolist():
        t, h, w = int(t), int(h), int(w)
        llm_grid_h = h // spatial_merge_size
        llm_grid_w = w // spatial_merge_size

        index = torch.arange(t * llm_grid_h * llm_grid_w).reshape(
            t, llm_grid_h, llm_grid_w
        )

        # Pad to window-aligned size; -100 marks padding slots.
        pad_h = (
            vit_merger_window_size - llm_grid_h % vit_merger_window_size
        ) % vit_merger_window_size
        pad_w = (
            vit_merger_window_size - llm_grid_w % vit_merger_window_size
        ) % vit_merger_window_size

        if pad_h > 0 or pad_w > 0:
            index = F.pad(index, (0, pad_w, 0, pad_h), value=-100)

        padded_h = llm_grid_h + pad_h
        padded_w = llm_grid_w + pad_w
        num_windows_h = padded_h // vit_merger_window_size
        num_windows_w = padded_w // vit_merger_window_size

        # Reshape into windows
        index = index.reshape(
            t,
            num_windows_h,
            vit_merger_window_size,
            num_windows_w,
            vit_merger_window_size,
        )
        index = index.permute(0, 1, 3, 2, 4)  # (T, nWh, nWw, wH, wW)
        index = index.reshape(
            t,
            num_windows_h * num_windows_w,
            vit_merger_window_size,
            vit_merger_window_size,
        )

        # Count valid (non-padding) tokens per window.
        seqlens = (index != -100).sum(dim=[2, 3]).reshape(-1)  # (T * num_windows,)

        # Extract valid indices in window order.
        valid_mask = index != -100
        valid_indices = index[valid_mask]  # Flat list of valid indices

        all_window_indices.append(valid_indices + offset)
        # Window lengths are in raw-patch units: each merged token covers
        # spatial_merge_unit patches.
        all_window_seqlens.append(seqlens * spatial_merge_unit)

        offset += t * llm_grid_h * llm_grid_w

    window_index = torch.cat(all_window_indices)
    cu_window_seqlens_raw = torch.cat(all_window_seqlens)

    # Build cumulative sequence lengths (leading 0, then running totals).
    cu_window_seqlens = torch.zeros(len(cu_window_seqlens_raw) + 1, dtype=torch.int32)
    cu_window_seqlens[1:] = cu_window_seqlens_raw.cumsum(0)

    return window_index, cu_window_seqlens


def compute_vision_rotary_pos_emb(grid_thw, spatial_merge_size, head_dim):
    """Compute 2D spatial rotary position embeddings for vision tokens.

    Mirrors the on-device path in NeuronQwen2_5_VLVisionModel.forward: a
    (max_grid_size, head_dim//4) frequency table is indexed by per-patch
    (h, w) ids, flattened to head_dim//2, then duplicated once to head_dim.

    Args:
        grid_thw: Tensor of shape (num_images, 3) with [T, H, W]
        spatial_merge_size: Spatial merge factor (2)
        head_dim: Attention head dimension (80)

    Returns:
        cos, sin: Tensors of shape (num_images, seq_per_image, head_dim)
    """
    # Build position IDs per image, grouped by merge block (matches
    # NeuronQwen2_5_VLVisionModel.rot_pos_ids).
    pos_ids_list = []
    for t, h, w in grid_thw.tolist():
        t, h, w = int(t), int(h), int(w)
        hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
        hpos_ids = (
            hpos_ids.reshape(
                h // spatial_merge_size,
                spatial_merge_size,
                w // spatial_merge_size,
                spatial_merge_size,
            )
            .permute(0, 2, 1, 3)
            .flatten()
        )

        wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
        wpos_ids = (
            wpos_ids.reshape(
                h // spatial_merge_size,
                spatial_merge_size,
                w // spatial_merge_size,
                spatial_merge_size,
            )
            .permute(0, 2, 1, 3)
            .flatten()
        )

        pos_ids_list.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))

    pos_ids = torch.cat(pos_ids_list, dim=0)

    # Compute rotary frequency table.
    max_grid_size = max(grid_thw[:, 1:].max().item(), 1)
    dim = head_dim // 2
    inv_freq = 1.0 / (10000.0 ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

    # BUGFIX: the table must hold the raw freqs (head_dim//4 per row).
    # Previously it was cat((freqs, freqs)) AND the result was cat'd again
    # below, producing width 2*head_dim -- disagreeing with this docstring
    # and with the model's on-device rotary cache path.
    emb_cache = torch.arange(max_grid_size, dtype=torch.float32).unsqueeze(
        1
    ) * inv_freq.unsqueeze(0)  # (max_grid_size, head_dim//4)

    # Index (h, w) rows and flatten -> (total_patches, head_dim//2)
    rotary_pos_emb = emb_cache[pos_ids].flatten(1)
    emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)  # (total, head_dim)
    cos_emb = emb.cos()
    sin_emb = emb.sin()

    # Reshape per image batch
    cos_emb = cos_emb.reshape(grid_thw.shape[0], -1, cos_emb.shape[-1])
    sin_emb = sin_emb.reshape(grid_thw.shape[0], -1, sin_emb.shape[-1])

    return cos_emb, sin_emb


# =============================================================================
# Neuron vision model (compiled)
# =============================================================================
transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize + + resized_h, resized_w = smart_resize(width=default_w, height=default_h) + self.max_grid_size = ( + max(resized_h, resized_w) // self.vision_config.patch_size + 1 + ) + + # Precompute and cache rotary embeddings + precomputed = self.rotary_pos_emb(self.max_grid_size) + self.register_buffer("rotary_pos_emb_cache", precomputed, persistent=False) + + # ViT blocks + self.blocks = nn.ModuleList( + [ + Qwen2_5_VLVisionBlock(self.vision_config) + for _ in range(self.vision_config.depth) + ] + ) + + # Patch merger: projects from vision hidden_size to text hidden_size + self.merger = Qwen2_5_VLPatchMerger( + dim=self.vision_config.out_hidden_size, + context_dim=self.vision_config.hidden_size, + spatial_merge_size=self.vision_config.spatial_merge_size, + dtype=dtype, + ) + + # Store fullatt_block_indexes for windowed vs global attention routing + self.fullatt_block_indexes = set(self.vision_config.fullatt_block_indexes) + + def rot_pos_ids(self, grid_thw): + """Compute position IDs for rotary embedding indexing.""" + pos_ids = [] + for t, h, w in grid_thw: + hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) + hpos_ids = hpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ) + hpos_ids = hpos_ids.permute(0, 2, 1, 3).flatten() + + wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) + wpos_ids = wpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ) + wpos_ids = wpos_ids.permute(0, 2, 1, 3).flatten() + + pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) + pos_ids = torch.cat(pos_ids, dim=0) + return pos_ids + + def pad_to_text_seq_len(self, hidden_states): + """Pad merged vision tokens to text sequence length.""" + padded_length = self.config.neuron_config.seq_len + hidden_states = hidden_states.to( + 
self.config.text_config.neuron_config.torch_dtype + ) + hidden_size = hidden_states.shape[-1] + hidden_states, _ = pad_tensor( + hidden_states, (padded_length, hidden_size), pad_value=0 + ) + hidden_states = hidden_states.view(-1, hidden_size).unsqueeze(0) + return hidden_states + + def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor): + """Forward pass through vision encoder. + + Args: + hidden_states: Raw pixel patches. + Shape: (total_patches, in_chans * patch_size * patch_size * temporal_patch_size) + grid_thw: Grid dimensions per image. Shape: (num_images, 3) + + Returns: + Merged vision features padded to text seq_len. + Shape: (1, seq_len, text_hidden_size) + """ + # Patch embed: raw pixels -> hidden_size + hidden_states = self.patch_embed(hidden_states) + + # Compute rotary position embeddings (via precomputed cache) + pos_ids = self.rot_pos_ids(grid_thw) + rotary_pos_emb = self.rotary_pos_emb_cache[pos_ids].flatten(1) + emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1) + cos_emb = emb.cos() + sin_emb = emb.sin() + cos_emb = cos_emb.reshape(grid_thw.shape[0], -1, cos_emb.shape[-1]) + sin_emb = sin_emb.reshape(grid_thw.shape[0], -1, sin_emb.shape[-1]) + position_embeddings = (cos_emb, sin_emb) + + # Reshape to (num_images, seq_per_image, hidden_size) + hidden_states = hidden_states.reshape( + grid_thw.shape[0], -1, hidden_states.shape[-1] + ) + + # Run through ViT blocks + for blk in self.blocks: + hidden_states = blk(hidden_states, position_embeddings) + + # Merge patches and project to text hidden dim + hidden_states = self.merger(hidden_states) + + return self.pad_to_text_seq_len(hidden_states) + + +# ============================================================================= +# Vision model wrapper (handles CPU preprocessing) +# ============================================================================= + + +class Qwen2_5_VLVisionModelWrapper(ModelWrapper): + """Wrapper that handles patch embedding, rotary position 
computation, + and windowed attention preprocessing on CPU before passing to the + compiled vision model on Neuron.""" + + def __init__( + self, + config, + model_cls, + tag="", + compiler_args=None, + priority_model_idx=None, + pipeline_execution=True, + return_ranked_to_cpu=False, + model_init_kwargs={}, + ): + super().__init__( + config, + model_cls, + tag, + compiler_args, + priority_model_idx, + pipeline_execution, + return_ranked_to_cpu, + model_init_kwargs, + ) + # Calculate image dimensions for bucketing + vc = self.config.vision_config + default_w = getattr(vc.neuron_config, "default_image_width", 640) + default_h = getattr(vc.neuron_config, "default_image_height", 640) + from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize + + resized_h, resized_w = smart_resize(width=default_w, height=default_h) + self.pixels_per_image = (resized_h // vc.patch_size) * ( + resized_w // vc.patch_size + ) + + def input_generator(self) -> List[Tuple[torch.Tensor]]: + """Generate input shapes for each vision bucket (number of images).""" + inputs = [] + vc = self.config.vision_config + dtype = vc.neuron_config.torch_dtype + + default_w = getattr(vc.neuron_config, "default_image_width", 640) + default_h = getattr(vc.neuron_config, "default_image_height", 640) + from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize + + resized_h, resized_w = smart_resize(width=default_w, height=default_h) + + for bucket in vc.neuron_config.buckets: + pixel_values = torch.ones( + [ + bucket * self.pixels_per_image, + vc.in_chans + * vc.patch_size + * vc.patch_size + * vc.temporal_patch_size, + ], + dtype=dtype, + ) + grid_thw = torch.tensor( + [[1, resized_h // vc.patch_size, resized_w // vc.patch_size]] + ).repeat(bucket, 1) + inputs.append((pixel_values, grid_thw)) + + return inputs + + def get_model_instance(self): + return EncoderModelInstance(model_cls=self.model_cls, config=self.config) + + def get_padded_num_image(self, pixel_values): + 
buckets = self.config.vision_config.neuron_config.buckets + for val in buckets: + if val * self.pixels_per_image >= pixel_values.shape[0]: + return val + raise Exception( + f"No bucket found for pixel_values with shape {pixel_values.shape[0]}. " + f"pixels_per_image={self.pixels_per_image}, buckets={buckets}" + ) + + def forward(self, pixel_values, grid_thw): + """Override forward: pad inputs and call compiled model.""" + if self.model is None: + raise RuntimeError("Forward called before load()") + + padded_num_image = self.get_padded_num_image(pixel_values) + padded_pixel_values = pad_with_first_batchline( + pixel_values, + (padded_num_image * self.pixels_per_image, pixel_values.shape[1]), + ) + padded_grid_thw = pad_with_first_batchline(grid_thw, (padded_num_image, 3)) + return self._forward(padded_pixel_values, padded_grid_thw) + + +# ============================================================================= +# NeuronApplication class +# ============================================================================= + + +class NeuronQwen2_5_VLForImageEncoding(NeuronApplicationBase): + """Neuron application class for Qwen2.5-VL vision encoder.""" + + _model_cls = NeuronQwen2_5_VLVisionModel + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.model_wrapper = self.get_model_wrapper_cls() + self.model = self.model_wrapper( + config=self.config, + model_cls=self._model_cls, + tag=self._model_cls.__name__, + compiler_args=self.get_compiler_args(), + priority_model_idx=0, + ) + self.models.append(self.model) + + def get_model_wrapper_cls(self): + return Qwen2_5_VLVisionModelWrapper + + def forward(self, pixel_values, grid_thw): + return self.models[0](pixel_values, grid_thw) + + def get_compiler_args(self): + return ( + "--auto-cast=none --model-type=transformer " + "--tensorizer-options='--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2' " + "-O1 --internal-hlo2tensorizer-options='--verify-hlo=true'" + ) + + @staticmethod + 
def update_state_dict_for_tied_weights(state_dict): + pass + + @staticmethod + def load_hf_model(model_path, **kwargs): + from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLConfig + + class HFVisionModel(nn.Module): + def __init__(self, model_path, **kwargs): + super().__init__() + self.hf_config = Qwen2_5_VLConfig.from_pretrained(model_path, **kwargs) + # Load just the vision model + full_model = Qwen2_5_VLForConditionalGeneration.from_pretrained( + model_path, torch_dtype=torch.bfloat16, **kwargs + ) + self.visual = full_model.model.visual + del full_model + + def forward(self, pixel_values, grid_thw): + return self.visual(pixel_values, grid_thw) + + return HFVisionModel(model_path, **kwargs) + + @staticmethod + def convert_hf_to_neuron_state_dict( + state_dict: dict, inference_config: InferenceConfig + ) -> dict: + """Convert HF vision state dict to Neuron format. + Maps: visual.blocks.N.attn.qkv -> blocks.N.attn.qkv_proj.Wqkv + Maps: visual.blocks.N.attn.proj -> blocks.N.attn.o_proj + Preserves all non-vision keys for downstream text conversion.""" + new_state_dict = {} + for key, value in state_dict.items(): + if "visual." in key: + new_key = key.replace("visual.", "") + if ".attn.qkv." in new_key: + new_key = new_key.replace(".attn.qkv.", ".attn.qkv_proj.Wqkv.") + elif ".attn.proj." 
in new_key: + new_key = new_key.replace(".attn.proj.", ".attn.o_proj.") + else: + new_key = key + new_state_dict[new_key] = ( + value.clone() + .detach() + .contiguous() + .to(inference_config.vision_config.neuron_config.torch_dtype) + ) + del state_dict + return new_state_dict + + @classmethod + def get_config_cls(cls): + from src.modeling_qwen2_5_vl import Qwen2_5_VLInferenceConfig + + return Qwen2_5_VLInferenceConfig + + @classmethod + def prepare_input_args(cls, prompts, images, processor, role="user", config=None): + """Prepare inputs for generation -- single batch only for now.""" + if len(prompts) > 1: + raise NotImplementedError("Qwen2.5-VL currently only supports batch size 1") + if isinstance(prompts, list): + prompts = prompts[0] + if images and isinstance(images, list) and isinstance(images[0], list): + images = images[0] + + # Build conversation format + content = [] + if images: + for img in images if isinstance(images, list) else [images]: + content.append({"type": "image", "image": img}) + content.append({"type": "text", "text": prompts}) + + messages = [{"role": role, "content": content}] + text = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + inputs = processor( + text=[text], + images=images if images else None, + padding=True, + return_tensors="pt", + ) + + vision_inputs = None + if hasattr(inputs, "pixel_values") and inputs.pixel_values is not None: + vision_inputs = { + "pixel_values": inputs.pixel_values, + "image_grid_thw": inputs.image_grid_thw, + } + + return inputs.input_ids, inputs.attention_mask, vision_inputs diff --git a/contrib/models/Qwen2.5-VL-32B-Instruct/test/__init__.py b/contrib/models/Qwen2.5-VL-7B-Instruct/test/__init__.py similarity index 100% rename from contrib/models/Qwen2.5-VL-32B-Instruct/test/__init__.py rename to contrib/models/Qwen2.5-VL-7B-Instruct/test/__init__.py diff --git a/contrib/models/Qwen2.5-VL-32B-Instruct/test/integration/__init__.py 
b/contrib/models/Qwen2.5-VL-7B-Instruct/test/integration/__init__.py similarity index 100% rename from contrib/models/Qwen2.5-VL-32B-Instruct/test/integration/__init__.py rename to contrib/models/Qwen2.5-VL-7B-Instruct/test/integration/__init__.py diff --git a/contrib/models/Qwen2.5-VL-7B-Instruct/test/integration/test_model.py b/contrib/models/Qwen2.5-VL-7B-Instruct/test/integration/test_model.py new file mode 100644 index 00000000..f9a029ba --- /dev/null +++ b/contrib/models/Qwen2.5-VL-7B-Instruct/test/integration/test_model.py @@ -0,0 +1,499 @@ +#!/usr/bin/env python3 +""" +Integration tests for Qwen2.5-VL-7B-Instruct NeuronX implementation. + +Tests cover: + - Model loading from pre-compiled artifacts + - Text-only generation with logit validation + - Vision-language generation with synthetic image + - Performance metrics (TTFT, throughput) + +Prerequisites: + - Pre-compiled model at COMPILED_MODEL_PATH + - HuggingFace weights at MODEL_PATH + - Neuron device available (trn2 or inf2) + +Usage: + # Run all tests + pytest test/integration/test_model.py -v --capture=tee-sys + + # Run specific test + pytest test/integration/test_model.py::test_text_generation -v +""" + +import os +import sys +import time +import logging + +import pytest +import torch +from pathlib import Path + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# Add src directory to path +sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) + +# ---- Configuration ---- +# Override via environment variables if needed +MODEL_PATH = os.environ.get("QWEN25VL_MODEL_PATH", "/mnt/models/Qwen2.5-VL-7B-Instruct") +COMPILED_MODEL_PATH = os.environ.get( + "QWEN25VL_COMPILED_PATH", "/mnt/models/qwen25vl_compiled" +) +TP_DEGREE = int(os.environ.get("QWEN25VL_TP_DEGREE", "4")) +SEQ_LEN = int(os.environ.get("QWEN25VL_SEQ_LEN", "4096")) + +# Token IDs for Qwen2.5-VL +BOS_TOKEN_ID = 151643 +EOS_TOKEN_ID = 151645 +PAD_TOKEN_ID = 151645 + + +# ---- Fixtures ---- + + 
+@pytest.fixture(scope="module") +def model_and_adapter(): + """Load pre-compiled model and create generation adapter.""" + from neuronx_distributed_inference.models.config import NeuronConfig + from neuronx_distributed_inference.utils.hf_adapter import ( + load_pretrained_config, + HuggingFaceGenerationAdapter, + ) + from modeling_qwen2_5_vl import ( + NeuronQwen2_5_VLForCausalLM, + Qwen2_5_VLInferenceConfig, + ) + + text_neuron_config = NeuronConfig( + batch_size=1, + ctx_batch_size=1, + seq_len=SEQ_LEN, + tp_degree=TP_DEGREE, + world_size=TP_DEGREE, + torch_dtype=torch.bfloat16, + save_sharded_checkpoint=True, + fused_qkv=True, + qkv_kernel_enabled=True, + mlp_kernel_enabled=False, + attn_kernel_enabled=True, + attn_tkg_nki_kernel_enabled=True, + logical_neuron_cores=2, + cc_pipeline_tiling_factor=2, + on_device_sampling_config=None, + cast_type="as-declared", + enable_bucketing=True, # Multi-bucket CTE for TTFT optimization + context_encoding_buckets=[ + 512, + 1024, + 2048, + 4096, + ], # Min 512 for TKG NKI kernel compat + token_generation_buckets=[4096], # Single TKG bucket at full seq_len + ) + + vision_neuron_config = NeuronConfig( + batch_size=1, + seq_len=SEQ_LEN, + tp_degree=TP_DEGREE, + world_size=TP_DEGREE, + save_sharded_checkpoint=True, + torch_dtype=torch.bfloat16, + fused_qkv=True, + attn_kernel_enabled=True, # Flash attention for bidirectional vision + mlp_kernel_enabled=False, + qkv_kernel_enabled=False, # Fused RMSNorm+QKV fails with vision RMSNorm eps type + cc_pipeline_tiling_factor=2, + cast_type="as-declared", + logical_neuron_cores=2, + enable_bucketing=True, + buckets=[2], + ) + + config = Qwen2_5_VLInferenceConfig( + text_neuron_config=text_neuron_config, + vision_neuron_config=vision_neuron_config, + load_config=load_pretrained_config(MODEL_PATH), + ) + + logger.info("Loading compiled model from %s", COMPILED_MODEL_PATH) + model = NeuronQwen2_5_VLForCausalLM(model_path=MODEL_PATH, config=config) + model.load(COMPILED_MODEL_PATH) + + 
adapter = HuggingFaceGenerationAdapter(model) + logger.info("Model loaded successfully.") + return model, adapter + + +@pytest.fixture(scope="module") +def processor(): + """Load the Qwen2.5-VL processor (tokenizer + image processor).""" + from transformers import AutoProcessor + + proc = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True) + return proc + + +@pytest.fixture(scope="module") +def generation_config(): + """Standard generation config for tests.""" + from transformers import GenerationConfig + + return GenerationConfig( + do_sample=False, + bos_token_id=BOS_TOKEN_ID, + eos_token_id=[EOS_TOKEN_ID], + pad_token_id=PAD_TOKEN_ID, + ) + + +@pytest.fixture(scope="module") +def sampling_params(): + """Sampling params for greedy decoding.""" + from neuronx_distributed_inference.modules.generation.sampling import ( + prepare_sampling_params, + ) + + return prepare_sampling_params( + batch_size=1, top_k=[1], top_p=[1.0], temperature=[1.0] + ) + + +# ---- Helper functions ---- + + +def generate_text( + adapter, processor, prompt, gen_config, samp_params, max_new_tokens=64 +): + """Run text-only generation and return output text + metadata.""" + messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}] + text = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + inputs = processor(text=[text], return_tensors="pt") + + start = time.time() + with torch.no_grad(): + generated = adapter.generate( + inputs.input_ids, + attention_mask=inputs.attention_mask, + sampling_params=samp_params, + generation_config=gen_config, + max_new_tokens=max_new_tokens, + ) + elapsed = time.time() - start + + output_ids = generated[0][inputs.input_ids.shape[1] :] + output_text = processor.decode(output_ids, skip_special_tokens=True) + return output_text, len(output_ids), elapsed + + +def generate_vl( + adapter, processor, prompt, image, gen_config, samp_params, max_new_tokens=64 +): + """Run vision-language generation 
and return output text + metadata.""" + messages = [ + { + "role": "user", + "content": [ + {"type": "image", "image": image}, + {"type": "text", "text": prompt}, + ], + } + ] + text = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt") + + start = time.time() + with torch.no_grad(): + generated = adapter.generate( + inputs.input_ids, + attention_mask=inputs.attention_mask, + pixel_values=inputs.pixel_values, + image_grid_thw=inputs.image_grid_thw, + sampling_params=samp_params, + generation_config=gen_config, + max_new_tokens=max_new_tokens, + ) + elapsed = time.time() - start + + output_ids = generated[0][inputs.input_ids.shape[1] :] + output_text = processor.decode(output_ids, skip_special_tokens=True) + return output_text, len(output_ids), elapsed + + +def make_test_image(width=224, height=224, color="red", shape="circle"): + """Create a synthetic test image.""" + from PIL import Image, ImageDraw + + img = Image.new("RGB", (width, height), "white") + draw = ImageDraw.Draw(img) + margin = min(width, height) // 8 + if shape == "circle": + draw.ellipse( + [margin, margin, width - margin, height - margin], + fill=color, + outline="black", + width=2, + ) + elif shape == "rectangle": + draw.rectangle( + [margin, margin, width - margin, height - margin], + fill=color, + outline="black", + width=2, + ) + return img + + +# ---- Tests ---- + + +def test_model_loads(model_and_adapter): + """Smoke test: model loads successfully from compiled artifacts.""" + model, adapter = model_and_adapter + assert model is not None + assert adapter is not None + assert hasattr(model, "config") + assert model.config.text_config.hidden_size == 3584 + assert model.config.text_config.num_hidden_layers == 28 + logger.info("PASS: Model loaded (hidden_size=3584, layers=28)") + + +def test_text_generation( + model_and_adapter, processor, generation_config, sampling_params 
+): + """Test text-only generation produces correct output.""" + _, adapter = model_and_adapter + output, num_tokens, elapsed = generate_text( + adapter, + processor, + "What is the capital of France? Answer in one sentence.", + generation_config, + sampling_params, + max_new_tokens=32, + ) + logger.info("Text output (%d tokens, %.2fs): %s", num_tokens, elapsed, output) + + assert num_tokens > 0, "Should generate at least one token" + assert "Paris" in output, f"Expected 'Paris' in output, got: {output}" + + +def test_logit_validation( + model_and_adapter, processor, generation_config, sampling_params +): + """Validate Neuron logits against CPU HF reference using NxDI logit_validation. + + Uses teacher forcing to compare per-position logit distributions, not just + top-1 token matching. + """ + from neuronx_distributed_inference.experimental.core.accuracy.logit_validation import ( + logit_validation, + ) + from transformers import GenerationConfig + + _, adapter = model_and_adapter + prompt = "What is the capital of France? Answer in one sentence." + messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}] + text = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + inputs = processor(text=[text], return_tensors="pt") + input_ids = inputs.input_ids + + max_new_tokens = 16 + + # 1. 
Generate CPU reference logits + logger.info("Loading CPU reference model for logit validation...") + from transformers import Qwen2_5_VLForConditionalGeneration + + cpu_model = Qwen2_5_VLForConditionalGeneration.from_pretrained( + MODEL_PATH, torch_dtype=torch.float32 + ) + cpu_model.eval() + + cpu_gen_config = GenerationConfig( + do_sample=False, + max_new_tokens=max_new_tokens, + return_dict_in_generate=True, + output_scores=True, + bos_token_id=BOS_TOKEN_ID, + eos_token_id=[EOS_TOKEN_ID], + pad_token_id=PAD_TOKEN_ID, + ) + + with torch.no_grad(): + cpu_result = cpu_model.generate(input_ids, generation_config=cpu_gen_config) + expected_logits = torch.stack(cpu_result.scores) # (seq_len, batch, vocab) + logger.info("CPU reference logits shape: %s", expected_logits.shape) + + # Clean up CPU model to free memory + del cpu_model + torch.cuda.empty_cache() if torch.cuda.is_available() else None + + # 2. Build Neuron generate_fn + neuron_gen_config = GenerationConfig( + do_sample=False, + max_new_tokens=max_new_tokens, + return_dict_in_generate=True, + output_scores=True, + bos_token_id=BOS_TOKEN_ID, + eos_token_id=[EOS_TOKEN_ID], + pad_token_id=PAD_TOKEN_ID, + ) + + def generate_fn(input_ids_list): + input_tensor = torch.tensor(input_ids_list) + attention_mask = torch.ones_like(input_tensor) + result = adapter.generate( + input_tensor, + attention_mask=attention_mask, + sampling_params=sampling_params, + generation_config=neuron_gen_config, + ) + return torch.stack(result.scores) + + # 3. 
Validate + passed = logit_validation( + input_ids=input_ids.tolist(), + generate_fn=generate_fn, + expected_logits=expected_logits, + ) + assert passed, "Logit validation failed: Neuron logits diverged from CPU reference" + logger.info("PASS: logit_validation -- Neuron logits match CPU reference") + + +def test_vl_generation( + model_and_adapter, processor, generation_config, sampling_params +): + """Test vision-language generation with a synthetic image.""" + _, adapter = model_and_adapter + image = make_test_image(448, 448, "green", "circle") + output, num_tokens, elapsed = generate_vl( + adapter, + processor, + "What shape and color do you see? Be brief.", + image, + generation_config, + sampling_params, + max_new_tokens=32, + ) + logger.info("VL output (%d tokens, %.2fs): %s", num_tokens, elapsed, output) + + assert num_tokens > 0, "Should generate at least one token" + output_lower = output.lower() + assert "green" in output_lower or "circle" in output_lower, ( + f"Expected 'green' or 'circle' in output, got: {output}" + ) + + +def test_vl_different_resolutions( + model_and_adapter, processor, generation_config, sampling_params +): + """Test VL generation works across different image resolutions.""" + _, adapter = model_and_adapter + + resolutions = [(224, 224), (448, 448), (672, 672), (640, 480)] + for w, h in resolutions: + image = make_test_image(w, h, "blue", "rectangle") + output, num_tokens, elapsed = generate_vl( + adapter, + processor, + "What do you see?", + image, + generation_config, + sampling_params, + max_new_tokens=16, + ) + assert num_tokens > 0, f"Failed for resolution {w}x{h}" + logger.info( + " %dx%d: %d tokens, %.2fs -- %s", w, h, num_tokens, elapsed, output[:60] + ) + + logger.info("PASS: All %d resolutions produced output", len(resolutions)) + + +def test_performance_ttft( + model_and_adapter, processor, generation_config, sampling_params +): + """Measure Time To First Token for text-only input.""" + _, adapter = model_and_adapter + + # 
Warmup + generate_text( + adapter, + processor, + "Hello", + generation_config, + sampling_params, + max_new_tokens=2, + ) + + # Measure + times = [] + for _ in range(5): + _, _, elapsed = generate_text( + adapter, + processor, + "Hello", + generation_config, + sampling_params, + max_new_tokens=1, + ) + times.append(elapsed * 1000) # ms + + avg_ttft = sum(times) / len(times) + logger.info("TTFT: %.1f ms (avg of %d runs)", avg_ttft, len(times)) + assert avg_ttft < 500, f"TTFT {avg_ttft:.1f}ms exceeds 500ms threshold" + + +def test_performance_throughput( + model_and_adapter, processor, generation_config, sampling_params +): + """Measure token generation throughput.""" + _, adapter = model_and_adapter + num_tokens_target = 64 + + # Warmup + generate_text( + adapter, + processor, + "Hello", + generation_config, + sampling_params, + max_new_tokens=4, + ) + + _, num_tokens, elapsed = generate_text( + adapter, + processor, + "Write a short paragraph about machine learning.", + generation_config, + sampling_params, + max_new_tokens=num_tokens_target, + ) + throughput = num_tokens / elapsed if elapsed > 0 else 0 + logger.info( + "Throughput: %.1f tok/s (%d tokens in %.2fs)", throughput, num_tokens, elapsed + ) + assert throughput > 10, ( + f"Throughput {throughput:.1f} tok/s below 10 tok/s threshold" + ) + + +# ---- Main (for running outside pytest) ---- + +if __name__ == "__main__": + print("=" * 80) + print("Qwen2.5-VL-7B-Instruct Integration Tests") + print("=" * 80) + print(f"Model path: {MODEL_PATH}") + print(f"Compiled path: {COMPILED_MODEL_PATH}") + print(f"TP degree: {TP_DEGREE}") + print() + print("Run with: pytest test/integration/test_model.py -v --capture=tee-sys") + print("=" * 80) diff --git a/contrib/models/Qwen2.5-VL-32B-Instruct/test/unit/__init__.py b/contrib/models/Qwen2.5-VL-7B-Instruct/test/unit/__init__.py similarity index 100% rename from contrib/models/Qwen2.5-VL-32B-Instruct/test/unit/__init__.py rename to 
contrib/models/Qwen2.5-VL-7B-Instruct/test/unit/__init__.py