Skip to content

Commit 32f1f0c

Browse files
authored
Fix processor args (#4200)
* fix mm processor args
* comment
* optimize
* add test case
* add comments
* fix typo
* fix generate
* warning
* remove
1 parent eb04b42 commit 32f1f0c

File tree

3 files changed

+131
-7
lines changed

3 files changed

+131
-7
lines changed

lmdeploy/serve/openai/protocol.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -449,6 +449,11 @@ class GenerateReqInput(BaseModel):
449449
spaces_between_special_tokens: Optional[bool] = True
450450
include_stop_str_in_output: Optional[bool] = False
451451
return_routed_experts: Optional[bool] = False
452+
# kwargs for hf processor
453+
mm_processor_kwargs: Optional[dict[str, Any]] = Field(
454+
default=None,
455+
description=('Additional kwargs to pass to the HF processor'),
456+
)
452457

453458

454459
class GenerateReqMetaOutput(BaseModel):

lmdeploy/vl/model/qwen3.py

Lines changed: 46 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,11 @@
44
import torch
55
from transformers import AutoProcessor
66

7+
from lmdeploy.utils import get_logger
78
from lmdeploy.vl.model.base import VISION_MODELS, VisionModel
89

10+
logger = get_logger('lmdeploy')
11+
912

1013
def check_transformers():
1114
try:
@@ -29,23 +32,59 @@ def build_preprocessor(self):
2932
self.image_token_id = tokenizer.encode(self.image_token)[-1]
3033
self.mm_processor_kwargs = None
3134

35+
def get_processor_args(self, mm_processor_kwargs: Optional[Dict[str, Any]] = None):
    """Resolve the effective ``(min_pixels, max_pixels)`` for the HF image processor.

    Starts from the processor defaults (``size['shortest_edge']`` /
    ``size['longest_edge']``) and overrides them with the values from
    ``mm_processor_kwargs`` when those pass the boundary checks.  Any
    inconsistent request logs a warning and falls back to the defaults
    instead of raising, so a bad per-request override can never break
    preprocessing.

    Args:
        mm_processor_kwargs: optional per-request dict that may contain
            ``min_pixels`` and/or ``max_pixels`` overrides; other keys are
            ignored here.

    Returns:
        Tuple ``(min_pixels, max_pixels)`` to pass to the image processor's
        ``size`` argument.
    """
    min_pixels = self.processor.image_processor.size['shortest_edge']
    max_pixels = self.processor.image_processor.size['longest_edge']

    if mm_processor_kwargs is None:
        return min_pixels, max_pixels

    input_min_pixels = mm_processor_kwargs.get('min_pixels', None)
    input_max_pixels = mm_processor_kwargs.get('max_pixels', None)

    # Boundary checks as guard clauses: each invalid combination warns and
    # keeps the defaults; each valid one returns immediately.
    if input_min_pixels is None and input_max_pixels is None:
        # nothing to override
        return min_pixels, max_pixels

    if input_min_pixels is None:
        # only max_pixels is given in the input
        if input_max_pixels < min_pixels:
            logger.warning(
                f'input max_pixels {input_max_pixels} < default min_pixels {min_pixels}, fall back to default.')
            return min_pixels, max_pixels
        return min_pixels, input_max_pixels

    if input_max_pixels is None:
        # only min_pixels is given in the input
        if input_min_pixels > max_pixels:
            logger.warning(
                f'input min_pixels {input_min_pixels} > default max_pixels {max_pixels}, fall back to default.')
            return min_pixels, max_pixels
        return input_min_pixels, max_pixels

    # both are given: they only need to be mutually consistent
    if input_min_pixels > input_max_pixels:
        logger.warning(f'input min_pixels {input_min_pixels} > max_pixels {input_max_pixels}, fall back to default.')
        return min_pixels, max_pixels
    return input_min_pixels, input_max_pixels
70+
3271
def preprocess(self, messages: List[Dict], mm_processor_kwargs: Optional[Dict[str, Any]] = None) -> List[Dict]:
3372
"""Refer to `super().preprocess()` for spec."""
34-
if mm_processor_kwargs is None:
35-
mm_processor_kwargs = {}
73+
74+
min_pixels, max_pixels = self.get_processor_args(mm_processor_kwargs)
3675

3776
images = self.collect_images(messages)
38-
optional_keys = {'resized_height', 'resized_width', 'min_pixels', 'max_pixels'}
3977
outputs = []
4078
for image, params in images:
4179
image = image.convert('RGB')
4280

43-
item = dict(type='image', image=image)
44-
item.update({key: params[key] for key in params.keys() if key in optional_keys})
4581
result = self.processor.image_processor(images=image,
4682
videos=None,
47-
return_tensors='pt',
48-
**mm_processor_kwargs)
83+
size={
84+
'shortest_edge': min_pixels,
85+
'longest_edge': max_pixels
86+
},
87+
return_tensors='pt')
4988
merge_length = self.processor.image_processor.merge_size**2
5089
image_tokens = result['image_grid_thw'].prod(dim=1) // merge_length
5190
result.update(dict(image_size=image.size, image_tokens=image_tokens, image_token_id=self.image_token_id))
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
import copy
2+
3+
import pytest
4+
5+
from lmdeploy.vl.model.qwen3 import Qwen3VLModel
6+
from lmdeploy.vl.utils import load_image
7+
8+
# Model paths handed to Qwen3VLModel by the module-scoped fixture below.
# NOTE(review): these look like HF-hub repo ids, so the tests presumably
# need network/model access — confirm before running in CI.
QWEN3VL_MODELS = [
    'Qwen/Qwen3-VL-4B-Instruct',
]

# Publicly hosted sample image fetched by the sample_messages fixture.
IMAGE_URL = ('https://raw.githubusercontent.com/open-mmlab/'
             'mmdeploy/main/tests/data/tiger.jpeg')
14+
15+
16+
@pytest.fixture(scope='module', params=QWEN3VL_MODELS)
def qwen3vl_model(request):
    """Module-scoped fixture yielding a preprocessor-ready Qwen3VLModel per model path."""
    vl_model = Qwen3VLModel(model_path=request.param)
    vl_model.build_preprocessor()
    return vl_model
22+
23+
24+
@pytest.fixture
def sample_messages():
    """Build a single-turn user message combining text and a downloaded image."""
    pil_image = load_image(IMAGE_URL)
    text_item = {'type': 'text', 'text': 'Can you describe this image?'}
    image_item = {'type': 'image', 'image': pil_image}
    return [{'role': 'user', 'content': [text_item, image_item]}]
42+
43+
44+
def test_qwen3vl_preprocess_with_custom_pixels(qwen3vl_model, sample_messages):
    """Test that mm_processor_kwargs with min/max pixels takes effect."""

    # compression ratio for qwen3vl is 32 = patch_size * spatial_merge_size = 16 * 2
    # qwen3vl_model.processor.image_processor.size['shortest_edge'] = 65536
    # 65536 = 64 * 32 * 32, indicates 64 image token budget
    # qwen3vl_model.processor.image_processor.size['longest_edge'] = 16777216
    # 16777216 = 16384 * 32 * 32, indicates 16384 image token budget

    # Default processing without custom arguments.
    # deepcopy so each preprocess call sees pristine messages —
    # NOTE(review): presumably preprocess mutates the message list in place; confirm.
    default_processed_messages = qwen3vl_model.preprocess(messages=copy.deepcopy(sample_messages))
    default_content = default_processed_messages[-1]['content']
    default_shape = default_content[0]['pixel_values'].shape  # [280, 1536]

    # Processing with smaller pixel range: a 10–20 image-token budget
    # (each token covers 32 * 32 pixels, per the ratio above)
    mm_processor_kwargs = {'min_pixels': 10 * 32 * 32, 'max_pixels': 20 * 32 * 32}
    custom_processed_messages = qwen3vl_model.preprocess(messages=copy.deepcopy(sample_messages),
                                                         mm_processor_kwargs=mm_processor_kwargs)
    custom_content = custom_processed_messages[-1]['content']
    custom_shape = custom_content[0]['pixel_values'].shape  # [60, 1536]

    # A tighter budget must shrink the patch count (dim 0 of pixel_values)
    assert default_shape != custom_shape, \
        'Default and custom processing should result in different shapes.'
    assert default_shape[0] > custom_shape[0], \
        'Custom processing with smaller pixel range should result in smaller image size.'

    # Processing with larger pixel range: a 100–20000 image-token budget
    mm_processor_kwargs = {'min_pixels': 100 * 32 * 32, 'max_pixels': 20000 * 32 * 32}
    custom_processed_messages = qwen3vl_model.preprocess(messages=copy.deepcopy(sample_messages),
                                                         mm_processor_kwargs=mm_processor_kwargs)
    custom_content = custom_processed_messages[-1]['content']
    custom_shape = custom_content[0]['pixel_values'].shape  # [468, 1536]

    # A larger minimum budget must grow the patch count relative to default
    assert default_shape != custom_shape, \
        'Default and custom processing should result in different shapes.'
    assert default_shape[0] < custom_shape[0], \
        'Custom processing with larger pixel range should result in larger image size.'

0 commit comments

Comments
 (0)