diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 3d07e6b612ca..c6d7f19cbe90 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -499,8 +499,6 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
     model to perform tasks that involve both image and text inputs.
     """
 
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             # mapping for new names in checkpoint saved after transformers v4.52
diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py
index 0ada2ed5028b..ee9e210a3240 100644
--- a/vllm/model_executor/models/aya_vision.py
+++ b/vllm/model_executor/models/aya_vision.py
@@ -318,8 +318,6 @@ def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
     dummy_inputs=AyaVisionDummyInputsBuilder,
 )
 class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             # mapping for new names in checkpoint saved after transformers v4.52
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index f71b9c01d359..1244f97a1bd6 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -523,8 +523,6 @@ def _get_prompt_updates(
 class Blip2ForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant
 ):
-    merge_by_field_config = True
-
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index 3aa01bb1905f..dfc05a366b28 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -918,8 +918,6 @@ def forward(
 class ChameleonForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant
 ):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index b8af3050990b..22f3ecad748e 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -784,7 +784,6 @@ class CLIPEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
     is_pooling_model = True
     packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
 
-    merge_by_field_config = True
 
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py
index 139ccba9df6d..07dc7a01dc31 100644
--- a/vllm/model_executor/models/cohere2_vision.py
+++ b/vllm/model_executor/models/cohere2_vision.py
@@ -331,8 +331,6 @@ def get_replacement(item_idx: int):
     dummy_inputs=Cohere2VisionDummyInputsBuilder,
 )
 class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "model.vision_tower.": "vision_tower.",
diff --git a/vllm/model_executor/models/deepseek_ocr.py b/vllm/model_executor/models/deepseek_ocr.py
index a612ebd95628..1f07381c0cbd 100644
--- a/vllm/model_executor/models/deepseek_ocr.py
+++ b/vllm/model_executor/models/deepseek_ocr.py
@@ -344,8 +344,6 @@ def get_replacement_deepseek_vl2(item_idx: int):
     dummy_inputs=DeepseekOCRDummyInputsBuilder,
 )
 class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             # map prefix for language backbone
diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
index 56c1a87a2540..9f8faf9ed91c 100644
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -344,8 +344,6 @@ def _cached_apply_hf_processor(
     dummy_inputs=DeepseekVL2DummyInputsBuilder,
 )
 class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "language.": "language_model.",
diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py
index 5cc2a48f26d6..da19d8fdb15e 100644
--- a/vllm/model_executor/models/dots_ocr.py
+++ b/vllm/model_executor/models/dots_ocr.py
@@ -690,8 +690,6 @@ def forward(
     dummy_inputs=DotsOCRDummyInputsBuilder,
 )
 class DotsOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_substr={
             ".attn.qkv_proj.": ".attn.qkv.",
diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py
index 81663dd7bbb4..3305b6a0e58f 100644
--- a/vllm/model_executor/models/ernie45_vl.py
+++ b/vllm/model_executor/models/ernie45_vl.py
@@ -1254,8 +1254,6 @@ def get_dummy_mm_data(
 class Ernie4_5_VLMoeForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
 ):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index 269c36ab5b9c..8a7a3dd771c3 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -260,8 +260,6 @@ def get_replacement_fuyu(item_idx: int):
     dummy_inputs=FuyuDummyInputsBuilder,
 )
 class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "model.vision_embed_tokens.": "vision_embed_tokens.",
diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py
index 43c69e5e1399..e8dec36a1c5b 100644
--- a/vllm/model_executor/models/gemma3_mm.py
+++ b/vllm/model_executor/models/gemma3_mm.py
@@ -483,8 +483,6 @@ def forward(self, vision_outputs: torch.Tensor):
 class Gemma3ForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA
 ):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py
index 6ae76976eb46..7036118ada08 100644
--- a/vllm/model_executor/models/gemma3n_mm.py
+++ b/vllm/model_executor/models/gemma3n_mm.py
@@ -463,7 +463,6 @@ def forward(
 class Gemma3nForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsTranscription
 ):
-    merge_by_field_config = True
     supported_languages = ISO639_1_SUPPORTED_LANGS
 
     packed_modules_mapping = {
diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py
index 5ba3c0a35928..3cb53f2cbabe 100644
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -1424,8 +1424,6 @@ def get_video_replacement_glm4v(item_idx: int):
 class Glm4vForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
 ):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py
index 514082cf60ce..ec5af94e297c 100644
--- a/vllm/model_executor/models/glm4v.py
+++ b/vllm/model_executor/models/glm4v.py
@@ -561,8 +561,6 @@ def get_replacement(item_idx: int):
 class GLM4VForCausalLM(
     ChatGLMBaseModel, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
 ):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "query_key_value": ["query_key_value"],
         "dense_h_to_4h": ["dense_h_to_4h"],
diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py
index accf7e6ef2f4..a4e50f408628 100644
--- a/vllm/model_executor/models/granite_speech.py
+++ b/vllm/model_executor/models/granite_speech.py
@@ -564,7 +564,6 @@ class GraniteSpeechForConditionalGeneration(
     SupportsLoRA,
     SupportsTranscription,
 ):
-    merge_by_field_config = True
     supported_languages = ISO639_1_SUPPORTED_LANGS
 
     packed_modules_mapping = {
diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py
index 5aef09ca9c25..52ce9564c8d7 100644
--- a/vllm/model_executor/models/hunyuan_vision.py
+++ b/vllm/model_executor/models/hunyuan_vision.py
@@ -786,7 +786,6 @@ class HunYuanVLForConditionalGeneration(
     SupportsQuant,
     SupportsXDRoPE,
 ):
-    merge_by_field_config = True
     multimodal_cpu_fields = {"image_grid_thw"}
 
     # To ensure correct weight loading and mapping.
diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py
index db46353efde5..3a083870e4b5 100644
--- a/vllm/model_executor/models/hyperclovax_vision.py
+++ b/vllm/model_executor/models/hyperclovax_vision.py
@@ -592,8 +592,6 @@ def build_mlp(
     dummy_inputs=HCXVisionDummyInputsBuilder,
 )
 class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py
index 7c3933c6feb7..0eed46448786 100644
--- a/vllm/model_executor/models/idefics3.py
+++ b/vllm/model_executor/models/idefics3.py
@@ -576,8 +576,6 @@ def forward(
     dummy_inputs=Idefics3DummyInputsBuilder,
 )
 class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index 01b3e7827424..416ab236cd18 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -78,9 +78,9 @@ class SupportsMultiModal(Protocol):
     `multimodal_config.mm_encoder_tp_mode="data"`.
     """
 
-    merge_by_field_config: ClassVar[bool] = True
+    merge_by_field_config: ClassVar[bool | None] = None
     """
-    A flag that indicates which implementation of
+    [DEPRECATED] A flag that indicates which implementation of
     `vllm.multimodal.utils.group_mm_kwargs_by_modality` to use.
     """
 
@@ -260,7 +260,26 @@ def supports_multimodal(model: object) -> TypeIs[SupportsMultiModal]: ...
 def supports_multimodal(
     model: type[object] | object,
 ) -> TypeIs[type[SupportsMultiModal]] | TypeIs[SupportsMultiModal]:
-    return getattr(model, "supports_multimodal", False)
+    res = getattr(model, "supports_multimodal", False)
+
+    if res:
+        # We can remove this starting from v0.14
+        merge_by_field_config = getattr(model, "merge_by_field_config", None)
+        if merge_by_field_config is False:
+            raise ValueError(
+                "`merge_by_field_config=False` is no longer effective, "
+                "please update your model to consider the new batching logic "
+                "in `group_mm_kwargs_by_modality` (refer to "
+                "https://github.com/vllm-project/vllm/issues/26149), "
+                "and then remove the override from your model."
+            )
+        if merge_by_field_config is True:
+            logger.warning_once(
+                "`merge_by_field_config=True` is redundant, "
+                "please remove the override from your model."
+            )
+
+    return res
 
 
 def supports_multimodal_raw_input_only(model: type[object] | object) -> bool:
diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py
index c2195fd0cb88..18985cefbf5e 100644
--- a/vllm/model_executor/models/interns1.py
+++ b/vllm/model_executor/models/interns1.py
@@ -509,8 +509,6 @@ def get_replacement_interns1_video(item_idx: int):
 class InternS1ForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA
 ):
-    merge_by_field_config = True
-
     # To ensure correct weight loading and mapping.
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index fccddf3a6b29..15f7d4f418e4 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -1074,8 +1074,6 @@ def get_video_replacement_internvl(item_idx: int):
     dummy_inputs=InternVLDummyInputsBuilder,
 )
 class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
-    merge_by_field_config = True
-
     supports_encoder_tp_data = True
 
     @classmethod
diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py
index 09acf8372e16..f31da0ee302b 100644
--- a/vllm/model_executor/models/keye.py
+++ b/vllm/model_executor/models/keye.py
@@ -1292,8 +1292,6 @@ def _get_mm_fields_config(
 
 
 class BaseKeyeModule(nn.Module):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py
index 8167b82f3233..85267ccda8a9 100644
--- a/vllm/model_executor/models/kimi_vl.py
+++ b/vllm/model_executor/models/kimi_vl.py
@@ -298,8 +298,6 @@ def get_replacement(item_idx: int):
     dummy_inputs=KimiVLDummyInputsBuilder,
 )
 class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     supports_encoder_tp_data = True
 
     @classmethod
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index c1fb2d4f4af7..66a327bb7603 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -506,8 +506,6 @@ def init_vision_tower_for_llava(
     dummy_inputs=LlavaDummyInputsBuilder,
 )
 class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index b995cac47ac1..526846d0d981 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -223,8 +223,6 @@ def _get_mm_fields_config(
     dummy_inputs=LlavaDummyInputsBuilder,
 )
 class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             # mapping for new names in checkpoint saved after transformers v4.52
diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py
index 902c598c226f..cd55cfec6cde 100644
--- a/vllm/model_executor/models/llava_next_video.py
+++ b/vllm/model_executor/models/llava_next_video.py
@@ -299,8 +299,6 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor:
     dummy_inputs=LlavaNextVideoDummyInputsBuilder,
 )
 class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             # mapping for new names in checkpoint saved after transformers v4.52
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 4e243ade6835..5aa8de7dc252 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -479,8 +479,6 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor:
     dummy_inputs=LlavaOnevisionDummyInputsBuilder,
 )
 class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             # mapping for new names in checkpoint saved after transformers v4.52
diff --git a/vllm/model_executor/models/midashenglm.py b/vllm/model_executor/models/midashenglm.py
index d9b23811730d..2d506978d266 100644
--- a/vllm/model_executor/models/midashenglm.py
+++ b/vllm/model_executor/models/midashenglm.py
@@ -683,8 +683,6 @@ def get_replacement_midashenglm(item_idx: int):
     dummy_inputs=MiDashengLMDummyInputsBuilder,
 )
 class MiDashengLMModel(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index 6d0ebf5c9825..c45bdf95e748 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -1003,8 +1003,6 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
     instantiated.
     """
 
-    merge_by_field_config = True
-
     supports_encoder_tp_data = True
 
     @classmethod
diff --git a/vllm/model_executor/models/minimax_vl_01.py b/vllm/model_executor/models/minimax_vl_01.py
index 0939a72ba53e..e480454953df 100644
--- a/vllm/model_executor/models/minimax_vl_01.py
+++ b/vllm/model_executor/models/minimax_vl_01.py
@@ -179,8 +179,6 @@ def _get_mm_fields_config(
     dummy_inputs=MiniMaxVL01DummyInputsBuilder,
 )
 class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py
index 1ddb470a0f93..e9161e69e731 100644
--- a/vllm/model_executor/models/mistral3.py
+++ b/vllm/model_executor/models/mistral3.py
@@ -423,8 +423,6 @@ def init_vision_tower_for_llava(
 class Mistral3ForConditionalGeneration(
     nn.Module, SupportsLoRA, SupportsMultiModal, SupportsPP
 ):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py
index 286859d188d3..e944c0ee38aa 100644
--- a/vllm/model_executor/models/mllama4.py
+++ b/vllm/model_executor/models/mllama4.py
@@ -741,8 +741,6 @@ class Llama4ForConditionalGeneration(
     SupportsEagle3,
     SupportsLoRA,
 ):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index 7b53299cccbe..a6cd9ad16c18 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -1354,8 +1354,6 @@ def get_insertion_molmo(item_idx: int):
 class MolmoForCausalLM(
     nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, SupportsQuant
 ):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_substr={
             # vision backbone mapping
diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
index c4198d36b392..6dfab595e5b9 100644
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -1116,8 +1116,6 @@ def get_dummy_mm_data(
 class NemotronH_Nano_VL_V2(
     nn.Module, HasInnerState, IsHybrid, SupportsMultiModal, SupportsMultiModalPruning
 ):
-    merge_by_field_config = True
-
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py
index a57668b21fb8..391980fc61f9 100644
--- a/vllm/model_executor/models/nemotron_vl.py
+++ b/vllm/model_executor/models/nemotron_vl.py
@@ -358,8 +358,6 @@ def get_image_processor(self, **kwargs: object):
     dummy_inputs=BaseInternVLDummyInputsBuilder[NemotronVLProcessingInfo],
 )
 class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
-    merge_by_field_config = True
-
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
diff --git a/vllm/model_executor/models/opencua.py b/vllm/model_executor/models/opencua.py
index b92f0c9dac32..76a2d1cc242c 100644
--- a/vllm/model_executor/models/opencua.py
+++ b/vllm/model_executor/models/opencua.py
@@ -201,7 +201,6 @@ def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
     dummy_inputs=OpenCUADummyInputsBuilder,
 )
 class OpenCUAForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
-    merge_by_field_config = True
     multimodal_cpu_fields = {"image_grid_thw"}
 
     packed_modules_mapping = {
diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py
index a0fab820720f..0691bbc615be 100644
--- a/vllm/model_executor/models/ovis.py
+++ b/vllm/model_executor/models/ovis.py
@@ -414,8 +414,6 @@ def get_replacement_ovis(item_idx: int):
     dummy_inputs=OvisDummyInputsBuilder,
 )
 class Ovis(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
diff --git a/vllm/model_executor/models/ovis2_5.py b/vllm/model_executor/models/ovis2_5.py
index 85f37cfea10b..0ad22aab748e 100644
--- a/vllm/model_executor/models/ovis2_5.py
+++ b/vllm/model_executor/models/ovis2_5.py
@@ -456,8 +456,6 @@ def get_replacement_ovis(item_idx, modality: str):
     dummy_inputs=Ovis2_5DummyInputsBuilder,
 )
 class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py
index 1df5ff62fa5b..9703a5b417d0 100644
--- a/vllm/model_executor/models/paddleocr_vl.py
+++ b/vllm/model_executor/models/paddleocr_vl.py
@@ -1103,8 +1103,6 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
     dummy_inputs=PaddleOCRVLDummyInputsBuilder,
 )
 class PaddleOCRVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsMRoPE):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "model.": "language_model.model.",
diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py
index 9fa32f01d37a..67240c6e7124 100644
--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -251,8 +251,6 @@ def apply(
     dummy_inputs=PaliGemmaDummyInputsBuilder,
 )
 class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 384572217bc1..b7ae548069f2 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -562,8 +562,6 @@ def _apply_prompt_updates(
     dummy_inputs=Phi3VDummyInputsBuilder,
 )
 class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "model.vision_embed_tokens.wte": "embed_tokens",
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 8425549a7bd2..179d5df869be 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -984,8 +984,6 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
     Implements the Phi-4-multimodal-instruct model in vLLM.
     """
 
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": [
             "qkv_proj",
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index cad241842cd3..faf2d80d24bb 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -365,8 +365,6 @@ def _cached_apply_hf_processor(
     dummy_inputs=PixtralDummyInputsBuilder,
 )
 class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py
index 1ce0fb4e4d93..3438406c4fac 100644
--- a/vllm/model_executor/models/qwen2_5_omni_thinker.py
+++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py
@@ -773,8 +773,6 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
     SupportsMRoPE,
     Qwen2_5OmniConditionalGenerationMixin,
 ):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "thinker.lm_head.": "language_model.lm_head.",
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index cb521ebdf0af..488af192bfad 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -1039,7 +1039,6 @@ class Qwen2_5_VLForConditionalGeneration(
     SupportsMultiModalPruning,
     SupportsMRoPE,
 ):
-    merge_by_field_config = True
     multimodal_cpu_fields = {"image_grid_thw", "video_grid_thw"}
 
     packed_modules_mapping = {
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index 7e883a393aa8..f84ddfa84f6a 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -313,8 +313,6 @@ def get_replacement_qwen2_audio(item_idx: int):
     dummy_inputs=Qwen2AudioDummyInputsBuilder,
 )
 class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("audio"):
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index b74876849841..9da5080f8430 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -1131,7 +1131,6 @@ def _get_mm_fields_config(
 class Qwen2VLForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
 ):
-    merge_by_field_config = True
     multimodal_cpu_fields = {"image_grid_thw", "video_grid_thw"}
 
     # To ensure correct weight loading and mapping.
diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
index e6979211b707..dbe7bcd07576 100755
--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -1131,8 +1131,6 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
     SupportsMRoPE,
     Qwen3OmniMoeConditionalGenerationMixin,
 ):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "thinker.lm_head.": "language_model.lm_head.",
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 58721303dfc8..a5b10c95872d 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -1190,7 +1190,6 @@ class Qwen3VLForConditionalGeneration(
     SupportsMRoPE,
     SupportsEagle3,
 ):
-    merge_by_field_config = True
     multimodal_cpu_fields = {"image_grid_thw", "video_grid_thw"}
 
     packed_modules_mapping = {
diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py
index 55680b8e7ddf..caac14716782 100644
--- a/vllm/model_executor/models/qwen_vl.py
+++ b/vllm/model_executor/models/qwen_vl.py
@@ -703,8 +703,6 @@ def _get_prompt_updates(
 class QwenVLForConditionalGeneration(
     QWenBaseModel, SupportsPP, SupportsLoRA, SupportsMultiModal
 ):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "c_attn": ["c_attn"],
         "gate_up_proj": [
diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py
index 9db1423d98e0..2600dc1c9f79 100644
--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
@@ -989,7 +989,6 @@ class SiglipEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
     is_pooling_model = True
     packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
 
-    merge_by_field_config = True
 
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py
index 55c25ce6190f..f95fbffc1d0b 100644
--- a/vllm/model_executor/models/skyworkr1v.py
+++ b/vllm/model_executor/models/skyworkr1v.py
@@ -647,8 +647,6 @@ def get_replacement_skyworkr1v(item_idx: int):
     dummy_inputs=SkyworkR1VDummyInputsBuilder,
 )
 class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py
index 3e55ada0ed2e..e5038e56a270 100644
--- a/vllm/model_executor/models/step3_vl.py
+++ b/vllm/model_executor/models/step3_vl.py
@@ -916,8 +916,6 @@ def forward(
     dummy_inputs=Step3VLDummyInputsBuilder,
 )
 class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "model.": "language_model.model.",
diff --git a/vllm/model_executor/models/tarsier.py b/vllm/model_executor/models/tarsier.py
index 4d310712f303..7e82a4d725a6 100644
--- a/vllm/model_executor/models/tarsier.py
+++ b/vllm/model_executor/models/tarsier.py
@@ -400,8 +400,6 @@ def _get_layer_index(feature_layer_index: int, num_hidden_layers_total: int) ->
     dummy_inputs=TarsierDummyInputsBuilder,
 )
 class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
diff --git a/vllm/model_executor/models/terratorch.py b/vllm/model_executor/models/terratorch.py
index 9f34090e3107..402081a70631 100644
--- a/vllm/model_executor/models/terratorch.py
+++ b/vllm/model_executor/models/terratorch.py
@@ -227,7 +227,6 @@ def apply(
     dummy_inputs=TerratorchInputBuilder,
 )
 class Terratorch(nn.Module, IsAttentionFree, SupportsMultiModal):
-    merge_by_field_config = True
     supports_multimodal_raw_input_only = True
     is_pooling_model = True
 
diff --git a/vllm/model_executor/models/transformers/multimodal.py b/vllm/model_executor/models/transformers/multimodal.py
index ccf605371987..9d77dee2810c 100644
--- a/vllm/model_executor/models/transformers/multimodal.py
+++ b/vllm/model_executor/models/transformers/multimodal.py
@@ -264,7 +264,7 @@ def apply(
 
 class MultiModalMixin(SupportsMultiModal, SupportsMRoPE):
     supports_multimodal_raw_input_only = True
-    merge_by_field_config = True
+
     # Backwards compatibility for prev released models. State dicts back then
     # had different formats and cannot be loaded with `AutoModel` mapping as is
     hf_to_vllm_mapper = WeightsMapper(
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 2444159b2ad6..32a2ba1ef38f 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -498,8 +498,6 @@ def forward(
     dummy_inputs=UltravoxDummyInputsBuilder,
 )
 class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
-    merge_by_field_config = True
-
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py
index 45f8fa079c71..7b408248ec74 100644
--- a/vllm/model_executor/models/voxtral.py
+++ b/vllm/model_executor/models/voxtral.py
@@ -330,8 +330,6 @@ def _get_data_parser(self) -> MultiModalDataParser:
 class VoxtralForConditionalGeneration(
     nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, SupportsTranscription
 ):
-    merge_by_field_config = True
-
     supported_languages = ISO639_1_SUPPORTED_LANGS
 
     packed_modules_mapping = {
diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py
index 0daf6bda61cc..b2feff133515 100644
--- a/vllm/model_executor/models/whisper.py
+++ b/vllm/model_executor/models/whisper.py
@@ -775,7 +775,6 @@ def _get_prompt_updates(
 class WhisperForConditionalGeneration(
     nn.Module, SupportsTranscription, SupportsMultiModal
 ):
-    merge_by_field_config = True
     packed_modules_mapping = {
         "self_attn.qkv_proj": [
             "self_attn.q_proj",
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index f8e8847e8e60..9c5e3fb2b32a 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -426,7 +426,6 @@ def group_mm_kwargs_by_modality(
     Yields:
         A tuple `(modality, num_items, grouped_kwargs)`.
     """
-    # TODO: After v0.13, remove merge_by_field_config attribute from model impls
     if merge_by_field_config is not None:
         logger.warning_once(
             "The `merge_by_field_config` argument of `group_mm_kwargs_by_modality` "