Skip to content

Commit 32f1f0c

Browse files
authored
Fix processor args (#4200)
* fix mm processor args
* comment
* optimize
* add test case
* add comments
* fix typo
* fix generate
* warning
* remove
1 parent eb04b42 commit 32f1f0c

File tree

3 files changed

+131
-7
lines changed

3 files changed

+131
-7
lines changed

lmdeploy/serve/openai/protocol.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -449,6 +449,11 @@ class GenerateReqInput(BaseModel):
449449
spaces_between_special_tokens: Optional[bool] = True
450450
include_stop_str_in_output: Optional[bool] = False
451451
return_routed_experts: Optional[bool] = False
452+
# kwargs for hf processor
453+
mm_processor_kwargs: Optional[dict[str, Any]] = Field(
454+
default=None,
455+
description=('Additional kwargs to pass to the HF processor'),
456+
)
452457

453458

454459
class GenerateReqMetaOutput(BaseModel):

lmdeploy/vl/model/qwen3.py

Lines changed: 46 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,11 @@
44
import torch
55
from transformers import AutoProcessor
66

7+
from lmdeploy.utils import get_logger
78
from lmdeploy.vl.model.base import VISION_MODELS, VisionModel
89

10+
logger = get_logger('lmdeploy')
11+
912

1013
def check_transformers():
1114
try:
@@ -29,23 +32,59 @@ def build_preprocessor(self):
2932
self.image_token_id = tokenizer.encode(self.image_token)[-1]
3033
self.mm_processor_kwargs = None
3134

35+
def get_processor_args(self, mm_processor_kwargs: Optional[Dict[str, Any]] = None):
    """Resolve the effective ``(min_pixels, max_pixels)`` for the HF image processor.

    Starts from the processor defaults (``size['shortest_edge']`` /
    ``size['longest_edge']``) and overrides them with the values from
    ``mm_processor_kwargs`` when those pass the boundary checks.  Any
    inconsistent request logs a warning and falls back to the defaults
    instead of raising, so a bad per-request override can never break
    preprocessing.

    Args:
        mm_processor_kwargs: optional per-request dict that may contain
            ``min_pixels`` and/or ``max_pixels`` overrides; other keys are
            ignored here.

    Returns:
        Tuple ``(min_pixels, max_pixels)`` to pass to the image processor's
        ``size`` argument.
    """
    min_pixels = self.processor.image_processor.size['shortest_edge']
    max_pixels = self.processor.image_processor.size['longest_edge']

    if mm_processor_kwargs is None:
        return min_pixels, max_pixels

    input_min_pixels = mm_processor_kwargs.get('min_pixels', None)
    input_max_pixels = mm_processor_kwargs.get('max_pixels', None)

    # Boundary checks as guard clauses: each invalid combination warns and
    # keeps the defaults; each valid one returns immediately.
    if input_min_pixels is None and input_max_pixels is None:
        # nothing to override
        return min_pixels, max_pixels

    if input_min_pixels is None:
        # only max_pixels is given in the input
        if input_max_pixels < min_pixels:
            logger.warning(
                f'input max_pixels {input_max_pixels} < default min_pixels {min_pixels}, fall back to default.')
            return min_pixels, max_pixels
        return min_pixels, input_max_pixels

    if input_max_pixels is None:
        # only min_pixels is given in the input
        if input_min_pixels > max_pixels:
            logger.warning(
                f'input min_pixels {input_min_pixels} > default max_pixels {max_pixels}, fall back to default.')
            return min_pixels, max_pixels
        return input_min_pixels, max_pixels

    # both are given: they only need to be mutually consistent
    if input_min_pixels > input_max_pixels:
        logger.warning(f'input min_pixels {input_min_pixels} > max_pixels {input_max_pixels}, fall back to default.')
        return min_pixels, max_pixels
    return input_min_pixels, input_max_pixels
70+
3271
def preprocess(self, messages: List[Dict], mm_processor_kwargs: Optional[Dict[str, Any]] = None) -> List[Dict]:
3372
"""Refer to `super().preprocess()` for spec."""
34-
if mm_processor_kwargs is None:
35-
mm_processor_kwargs = {}
73+
74+
min_pixels, max_pixels = self.get_processor_args(mm_processor_kwargs)
3675

3776
images = self.collect_images(messages)
38-
optional_keys = {'resized_height', 'resized_width', 'min_pixels', 'max_pixels'}
3977
outputs = []
4078
for image, params in images:
4179
image = image.convert('RGB')
4280

43-
item = dict(type='image', image=image)
44-
item.update({key: params[key] for key in params.keys() if key in optional_keys})
4581
result = self.processor.image_processor(images=image,
4682
videos=None,
47-
return_tensors='pt',
48-
**mm_processor_kwargs)
83+
size={
84+
'shortest_edge': min_pixels,
85+
'longest_edge': max_pixels
86+
},
87+
return_tensors='pt')
4988
merge_length = self.processor.image_processor.merge_size**2
5089
image_tokens = result['image_grid_thw'].prod(dim=1) // merge_length
5190
result.update(dict(image_size=image.size, image_tokens=image_tokens, image_token_id=self.image_token_id))
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
import copy
2+
3+
import pytest
4+
5+
from lmdeploy.vl.model.qwen3 import Qwen3VLModel
6+
from lmdeploy.vl.utils import load_image
7+
8+
# Model paths handed to Qwen3VLModel by the module-scoped fixture below.
# NOTE(review): these look like HF-hub repo ids, so the tests presumably
# need network/model access — confirm before running in CI.
QWEN3VL_MODELS = [
    'Qwen/Qwen3-VL-4B-Instruct',
]

# Publicly hosted sample image fetched by the sample_messages fixture.
IMAGE_URL = ('https://raw.githubusercontent.com/open-mmlab/'
             'mmdeploy/main/tests/data/tiger.jpeg')
14+
15+
16+
@pytest.fixture(scope='module', params=QWEN3VL_MODELS)
def qwen3vl_model(request):
    """Module-scoped fixture yielding a preprocessor-ready Qwen3VLModel per model path."""
    vl_model = Qwen3VLModel(model_path=request.param)
    vl_model.build_preprocessor()
    return vl_model
22+
23+
24+
@pytest.fixture
def sample_messages():
    """Build a single-turn user message combining text and a downloaded image."""
    pil_image = load_image(IMAGE_URL)
    text_item = {'type': 'text', 'text': 'Can you describe this image?'}
    image_item = {'type': 'image', 'image': pil_image}
    return [{'role': 'user', 'content': [text_item, image_item]}]
42+
43+
44+
def test_qwen3vl_preprocess_with_custom_pixels(qwen3vl_model, sample_messages):
    """Test that mm_processor_kwargs with min/max pixels takes effect."""

    # compression ratio for qwen3vl is 32 = patch_size * spatial_merge_size = 16 * 2
    # qwen3vl_model.processor.image_processor.size['shortest_edge'] = 65536
    # 65536 = 64 * 32 * 32, indicates 64 image token budget
    # qwen3vl_model.processor.image_processor.size['longest_edge'] = 16777216
    # 16777216 = 16384 * 32 * 32, indicates 16384 image token budget

    # Default processing without custom arguments.
    # deepcopy so each preprocess call sees pristine messages —
    # NOTE(review): presumably preprocess mutates the message list in place; confirm.
    default_processed_messages = qwen3vl_model.preprocess(messages=copy.deepcopy(sample_messages))
    default_content = default_processed_messages[-1]['content']
    default_shape = default_content[0]['pixel_values'].shape  # [280, 1536]

    # Processing with smaller pixel range: a 10–20 image-token budget
    # (each token covers 32 * 32 pixels, per the ratio above)
    mm_processor_kwargs = {'min_pixels': 10 * 32 * 32, 'max_pixels': 20 * 32 * 32}
    custom_processed_messages = qwen3vl_model.preprocess(messages=copy.deepcopy(sample_messages),
                                                         mm_processor_kwargs=mm_processor_kwargs)
    custom_content = custom_processed_messages[-1]['content']
    custom_shape = custom_content[0]['pixel_values'].shape  # [60, 1536]

    # A tighter budget must shrink the patch count (dim 0 of pixel_values)
    assert default_shape != custom_shape, \
        'Default and custom processing should result in different shapes.'
    assert default_shape[0] > custom_shape[0], \
        'Custom processing with smaller pixel range should result in smaller image size.'

    # Processing with larger pixel range: a 100–20000 image-token budget
    mm_processor_kwargs = {'min_pixels': 100 * 32 * 32, 'max_pixels': 20000 * 32 * 32}
    custom_processed_messages = qwen3vl_model.preprocess(messages=copy.deepcopy(sample_messages),
                                                         mm_processor_kwargs=mm_processor_kwargs)
    custom_content = custom_processed_messages[-1]['content']
    custom_shape = custom_content[0]['pixel_values'].shape  # [468, 1536]

    # A larger minimum budget must grow the patch count relative to default
    assert default_shape != custom_shape, \
        'Default and custom processing should result in different shapes.'
    assert default_shape[0] < custom_shape[0], \
        'Custom processing with larger pixel range should result in larger image size.'

0 commit comments

Comments
 (0)