
Commit a34db18

Add mm_processor_args for Qwen3-VL (#4196)
* add mm processor args
* move args passing pos
* fix None case
* remove deprecated env
* add for /generate
* fix arg no
1 parent 1426ea4 commit a34db18

7 files changed: 38 additions, 13 deletions

lmdeploy/pytorch/envs.py

Lines changed: 0 additions & 1 deletion
@@ -125,7 +125,6 @@ def _patched_get_env(
 # dlblas
 # we don't need to read this, it would be passed to ray workers
 # If Ray is launched from outside, it may fail to access the environment variables.
-os.getenv('DEEPEP_MAX_BATCH_SIZE', None)
 os.getenv('DEEPEP_MAX_TOKENS_PER_RANK', None)
 os.getenv('DEEPEP_ENABLE_MNNVL', None)
 os.getenv('DEEPEP_MODE', 'auto')

lmdeploy/serve/async_engine.py

Lines changed: 2 additions & 0 deletions
@@ -769,6 +769,7 @@ async def generate(
             input_ids: Optional[List] = None,
             enable_thinking: Optional[bool] = None,
             add_vision_id: Optional[bool] = False,
+            mm_processor_kwargs: Optional[Dict[str, Any]] = None,
             **kwargs):
         """Generate responses.

@@ -821,6 +822,7 @@
             reasoning_effort=reasoning_effort,
             enable_thinking=enable_thinking,
             add_vision_id=add_vision_id,
+            mm_processor_kwargs=mm_processor_kwargs,
             **kwargs)
         prompt = prompt_input['prompt']
         input_ids = prompt_input['input_ids']
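
With this change, mm_processor_kwargs passed to AsyncEngine.generate() is forwarded verbatim into _get_prompt_input(). A minimal, hedged sketch of how a caller might use it follows; the engine and messages objects, the session id, and the max_pixels value are illustrative assumptions, not part of this diff:

# Hedged usage sketch: pass HF-processor kwargs through AsyncEngine.generate().
# `engine` is assumed to be an lmdeploy AsyncEngine serving a Qwen3-VL model.
import asyncio


async def demo(engine, messages):
    async for out in engine.generate(messages,
                                     session_id=1,
                                     mm_processor_kwargs={'max_pixels': 1280 * 28 * 28}):  # illustrative value
        print(out.response, end='', flush=True)

# asyncio.run(demo(engine, messages))  # needs a real engine and real messages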

lmdeploy/serve/openai/api_server.py

Lines changed: 2 additions & 3 deletions
@@ -470,7 +470,7 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Request = None):
         adapter_name=adapter_name,
         enable_thinking=request.enable_thinking,
         add_vision_id=request.add_vision_id,
-    )
+        mm_processor_kwargs=request.mm_processor_kwargs)

     def create_stream_response_json(index: int,
                                     delta_message: DeltaMessage,
@@ -911,7 +911,6 @@ async def _inner_call(i, generator):

 @router.post('/generate', dependencies=[Depends(check_api_key)])
 async def generate(request: GenerateReqInput, raw_request: Request = None):
-
     if request.session_id == -1:
         VariableInterface.session_id += 1
         request.session_id = VariableInterface.session_id
@@ -965,7 +964,7 @@ async def generate(request: GenerateReqInput, raw_request: Request = None):
         sequence_start=True,
         sequence_end=True,
         do_preprocess=False,
-    )
+        mm_processor_kwargs=request.mm_processor_kwargs)

     def create_generate_response_json(res, text, output_ids, logprobs, finish_reason, routed_experts=None):
         # only output router experts in last chunk
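
Both /v1/chat/completions and /generate now hand request.mm_processor_kwargs to the engine call. A hedged client-side sketch, assuming a server listening on localhost:23333 and using min_pixels/max_pixels purely as example processor kwargs:

# Hypothetical client sketch: send mm_processor_kwargs with a chat completion request.
import requests

payload = {
    'model': 'Qwen3-VL',  # placeholder model name
    'messages': [{
        'role': 'user',
        'content': [
            {'type': 'text', 'text': 'Describe this image.'},
            {'type': 'image_url', 'image_url': {'url': 'https://example.com/cat.jpg'}},
        ],
    }],
    # extra kwargs forwarded to the HF processor; valid keys depend on the model's processor
    'mm_processor_kwargs': {'min_pixels': 256 * 28 * 28, 'max_pixels': 1280 * 28 * 28},
}
resp = requests.post('http://localhost:23333/v1/chat/completions', json=payload, timeout=300)
print(resp.json())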

lmdeploy/serve/openai/protocol.py

Lines changed: 5 additions & 0 deletions
@@ -153,6 +153,11 @@ class ChatCompletionRequest(BaseModel):
     add_vision_id: Optional[bool] = False
     return_token_ids: Optional[bool] = False
     include_stop_str_in_output: Optional[bool] = False
+    # kwargs for hf processor
+    mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+        default=None,
+        description=('Additional kwargs to pass to the HF processor'),
+    )


 class FunctionCall(BaseModel):

lmdeploy/serve/vl_async_engine.py

Lines changed: 3 additions & 2 deletions
@@ -1,7 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.

 import asyncio
-from typing import Dict, List, Literal, Optional, Tuple, Union
+from typing import Any, Dict, List, Literal, Optional, Tuple, Union

 import PIL

@@ -57,6 +57,7 @@ async def _get_prompt_input(self,
                                 tools: Optional[List[object]] = None,
                                 enable_thinking: Optional[bool] = None,
                                 add_vision_id: Optional[bool] = False,
+                                mm_processor_kwargs: Optional[Dict[str, Any]] = None,
                                 **kwargs):
         """Process messages and return the required data for the inference
         engines.
@@ -91,7 +92,7 @@ async def _get_prompt_input(self,

         chat_template = self.chat_template if do_preprocess else BaseChatTemplate()
         messages = await self.async_convert_to_pil_images(messages)
-        results = await self.vl_encoder.preprocess(messages)
+        results = await self.vl_encoder.preprocess(messages, mm_processor_kwargs)
         if self.backend == 'turbomind':
             # for tm engine, this module perform vision embedding after image
             # preprocessing. It utilizes the hf model's vision embeddings

lmdeploy/vl/engine.py

Lines changed: 15 additions & 3 deletions
@@ -1,8 +1,9 @@
 # Copyright (c) OpenMMLab. All rights reserved.

 import asyncio
+import inspect
 from concurrent.futures import ThreadPoolExecutor
-from typing import Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union

 import torch

@@ -23,6 +24,11 @@ def _raise_exception_on_finish(task: asyncio.Task) -> None:
         raise e


+def _accepts_arg(func, arg_name: str) -> bool:
+    """Check if a function accepts a specific keyword argument."""
+    return arg_name in inspect.signature(func).parameters
+
+
 class ImageEncoder:
     """Image encoder."""

@@ -41,9 +47,15 @@ def __init__(
         self.executor = ThreadPoolExecutor(max_workers=1)
         torch.cuda.empty_cache()

-    async def preprocess(self, messages: List[Dict]) -> List[Dict]:
+    async def preprocess(self,
+                         messages: List[Dict],
+                         mm_processor_kwargs: Optional[Dict[str, Any]] = None) -> List[Dict]:
         """Preprocess multimodal data in the messages."""
-        future = asyncio.get_event_loop().run_in_executor(self.executor, self.model.preprocess, messages)
+        if _accepts_arg(self.model.preprocess, 'mm_processor_kwargs'):
+            future = asyncio.get_event_loop().run_in_executor(self.executor, self.model.preprocess, messages,
+                                                              mm_processor_kwargs)
+        else:
+            future = asyncio.get_event_loop().run_in_executor(self.executor, self.model.preprocess, messages)
         future.add_done_callback(_raise_exception_on_finish)
         outputs = await future
         return outputs
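
The inspect-based check keeps older vision models working: only a preprocess() that declares an mm_processor_kwargs parameter receives the extra argument. A small standalone sketch of the same technique (the two preprocess functions below are made up for illustration):

# Standalone illustration of the signature check used by ImageEncoder.preprocess.
import inspect


def _accepts_arg(func, arg_name: str) -> bool:
    """Check if a function accepts a specific keyword argument."""
    return arg_name in inspect.signature(func).parameters


def old_preprocess(messages):  # legacy vision model: no extra kwargs
    return messages


def new_preprocess(messages, mm_processor_kwargs=None):  # updated model (e.g. Qwen3-VL)
    return messages, mm_processor_kwargs


print(_accepts_arg(old_preprocess, 'mm_processor_kwargs'))  # False -> called without kwargs
print(_accepts_arg(new_preprocess, 'mm_processor_kwargs'))  # True  -> kwargs are forwarded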

lmdeploy/vl/model/qwen3.py

Lines changed: 11 additions & 4 deletions
@@ -1,7 +1,8 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Dict, List, Optional
+from typing import Any, Dict, List, Optional

 import torch
+from transformers import AutoProcessor

 from lmdeploy.vl.model.base import VISION_MODELS, VisionModel

@@ -22,14 +23,17 @@ class Qwen3VLModel(VisionModel):

     def build_preprocessor(self):
         check_transformers()
-        from transformers import AutoProcessor
         self.processor = AutoProcessor.from_pretrained(self.model_path)
         tokenizer = self.processor.tokenizer
         self.image_token = self.processor.image_token
         self.image_token_id = tokenizer.encode(self.image_token)[-1]
+        self.mm_processor_kwargs = None

-    def preprocess(self, messages: List[Dict]) -> List[Dict]:
+    def preprocess(self, messages: List[Dict], mm_processor_kwargs: Optional[Dict[str, Any]] = None) -> List[Dict]:
         """Refer to `super().preprocess()` for spec."""
+        if mm_processor_kwargs is None:
+            mm_processor_kwargs = {}
+
         images = self.collect_images(messages)
         optional_keys = {'resized_height', 'resized_width', 'min_pixels', 'max_pixels'}
         outputs = []
@@ -38,7 +42,10 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:

             item = dict(type='image', image=image)
             item.update({key: params[key] for key in params.keys() if key in optional_keys})
-            result = self.processor.image_processor(images=image, videos=None, return_tensors='pt')
+            result = self.processor.image_processor(images=image,
+                                                    videos=None,
+                                                    return_tensors='pt',
+                                                    **mm_processor_kwargs)
             merge_length = self.processor.image_processor.merge_size**2
             image_tokens = result['image_grid_thw'].prod(dim=1) // merge_length
             result.update(dict(image_size=image.size, image_tokens=image_tokens, image_token_id=self.image_token_id))
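
Inside Qwen3VLModel.preprocess the dict is splatted straight into the HF image processor call, so any keyword that processor understands can now be tuned per request. A hedged sketch of the effect, assuming a local Qwen3-VL checkpoint whose image processor accepts the min_pixels/max_pixels resizing knobs (the same names already listed in optional_keys); the checkpoint name and image path are placeholders:

# Hypothetical sketch of what the forwarded kwargs do inside preprocess().
from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained('Qwen/Qwen3-VL-8B-Instruct')  # placeholder checkpoint
image = Image.open('demo.jpg').convert('RGB')  # any local RGB image

mm_processor_kwargs = {'min_pixels': 256 * 28 * 28, 'max_pixels': 1280 * 28 * 28}
result = processor.image_processor(images=image, videos=None, return_tensors='pt',
                                    **mm_processor_kwargs)
# A smaller max_pixels shrinks the image grid, i.e. fewer vision tokens per image.
print(result['image_grid_thw'])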
