
Commit a11f736

support chat_template_kwargs in v1/chat/completions (#4201)
* support chat_template_kwargs in v1/chat/completions
* fix
* fix
* fix
* fix ut
1 parent 32f1f0c commit a11f736

File tree

11 files changed: +74 -61 lines changed

lmdeploy/model.py

Lines changed: 3 additions & 6 deletions
@@ -768,18 +768,16 @@ def messages2prompt(self, messages, sequence_start=True, **kwargs):
                 'Each message should be a dict with "role" and "content" keys.'

         if 'enable_thinking' in kwargs and kwargs['enable_thinking'] is None:
-            # Workaround for internlm/Intern-S1: the chat template expects a <think> tag appended,
-            # but when enable_thinking=None is specified, the <think> tag is omitted.
+            # Workaround for internlm/Intern-S1: when enable_thinking=None is passed to apply_chat_template,
+            # the <think> tag is not generated.
             kwargs.pop('enable_thinking')
-        if 'reasoning_effort' in kwargs and kwargs.get('reasoning_effort', None) is None:
+        if 'reasoning_effort' in kwargs and kwargs['reasoning_effort'] is None:
             kwargs.pop('reasoning_effort')
-        add_vision_id = kwargs.pop('add_vision_id', False)
         add_generation_prompt = messages[-1]['role'] != 'assistant'
         if sequence_start:
             prompt = self.tokenizer.apply_chat_template(messages,
                                                         tokenize=False,
                                                         add_generation_prompt=add_generation_prompt,
-                                                        add_vision_id=add_vision_id,
                                                         **kwargs)
         else:
             # Use a sentinel position to avoid the influence of default system role in the tokenizer's chat template
@@ -790,7 +788,6 @@ def messages2prompt(self, messages, sequence_start=True, **kwargs):
             prompt = self.tokenizer.apply_chat_template(sentinel_messages + messages,
                                                         tokenize=False,
                                                         add_generation_prompt=add_generation_prompt,
-                                                        add_vision_id=add_vision_id,
                                                         **kwargs)
             # remove the sentinel part
             prompt = prompt[len(sentinel_prompt):]
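
In practice, whatever dict the caller supplies as chat_template_kwargs is now forwarded verbatim to tokenizer.apply_chat_template (only None-valued enable_thinking/reasoning_effort are dropped first), so template flags such as add_vision_id no longer need dedicated plumbing. A minimal sketch of the resulting call, assuming a Hugging Face tokenizer whose chat template reads enable_thinking; the checkpoint name is purely illustrative:

from transformers import AutoTokenizer

# Illustrative checkpoint; any chat model whose template understands these keys behaves the same way.
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-8B')

messages = [{'role': 'user', 'content': 'hi'}]
chat_template_kwargs = {'enable_thinking': False}

# messages2prompt forwards the dict untouched, so this mirrors what the template receives;
# extra keys are simply exposed as variables to the Jinja template.
prompt = tokenizer.apply_chat_template(messages,
                                       tokenize=False,
                                       add_generation_prompt=True,
                                       **chat_template_kwargs)
print(prompt)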

lmdeploy/serve/async_engine.py

Lines changed: 17 additions & 8 deletions
@@ -196,13 +196,15 @@ def __call__(self,
                  gen_config: Optional[GenerationConfig] = None,
                  stream_response: bool = True,
                  do_preprocess: bool = True,
-                 adapter_name: str = None) -> Union[Response, Iterator[Response]]:
+                 adapter_name: str = None,
+                 **kwargs) -> Union[Response, Iterator[Response]]:
         self._engine.chat(prompt,
                           gen_config=gen_config or self._gen_config,
                           stream_response=stream_response,
                           do_preprocess=do_preprocess,
                           session=self,
-                          adapter_name=adapter_name)
+                          adapter_name=adapter_name,
+                          **kwargs)
         if stream_response:
             return self.generator
         else:
@@ -691,7 +693,7 @@ async def _get_prompt_input(self,
                                 adapter_name: str,
                                 tools: Optional[List[object]] = None,
                                 reasoning_effort: Optional[Literal['low', 'medium', 'high']] = None,
-                                enable_thinking: Optional[bool] = None,
+                                chat_template_kwargs: Optional[Dict] = None,
                                 **kwargs):
         # Change multimodal data to openai text messages, i.e.,
         # [{'role': 'user', 'content': [{'type': 'text', 'text': 'hi'}]}] ->
@@ -706,12 +708,12 @@ async def _get_prompt_input(self,
             chat_template = MODELS.module_dict[adapter_name]()
         else:
             chat_template = BaseChatTemplate()
+        chat_template_kwargs = chat_template_kwargs or {}
         prompt = chat_template.messages2prompt(prompt,
                                                sequence_start,
                                                tools=tools,
-                                               enable_thinking=enable_thinking,
                                                reasoning_effort=reasoning_effort,
-                                               **kwargs)
+                                               **chat_template_kwargs)
         if prompt is None:
             raise ValueError(
                 f'You are using base template to handle chat task. Please specify a `--chat-template` name chosen from `lmdeploy list` if you want to use OpenAI messages input.'  # noqa
@@ -768,7 +770,7 @@ async def generate(
             rewind_stop_tokens: bool = False,
             input_ids: Optional[List] = None,
             enable_thinking: Optional[bool] = None,
-            add_vision_id: Optional[bool] = False,
+            chat_template_kwargs: Optional[Dict] = None,
             mm_processor_kwargs: Optional[Dict[str, Any]] = None,
             **kwargs):
         """Generate responses.
@@ -811,6 +813,14 @@ async def generate(
         if gen_config.n > 1:
             logger.warning(f'n({gen_config.n}) > 1 hasn\'t been supported yet. Fallback to 1')
             gen_config.n = 1
+        chat_template_kwargs = chat_template_kwargs or {}
+        if enable_thinking is not None:
+            logger.warning('enable_thinking is deprecated, use chat_template_kwargs["enable_thinking"] instead')
+            if chat_template_kwargs.get('enable_thinking') is None:
+                chat_template_kwargs['enable_thinking'] = enable_thinking
+            else:
+                logger.warning('chat_template_kwargs["enable_thinking"] is already set, '
+                               'the value will not be overwritten by enable_thinking')
         if messages:
             prompt = messages
             self.request_logger.log_prompt(session_id=session_id, prompt=prompt)
@@ -820,9 +830,8 @@ async def generate(
                                                        adapter_name,
                                                        tools=tools,
                                                        reasoning_effort=reasoning_effort,
-                                                       enable_thinking=enable_thinking,
-                                                       add_vision_id=add_vision_id,
                                                        mm_processor_kwargs=mm_processor_kwargs,
+                                                       chat_template_kwargs=chat_template_kwargs,
                                                        **kwargs)
         prompt = prompt_input['prompt']
         input_ids = prompt_input['input_ids']
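
Note the precedence in the new generate() logic: the deprecated enable_thinking argument only backfills chat_template_kwargs['enable_thinking'] when the caller has not set it, and a warning is logged either way. A hedged usage sketch against the updated signature; the model path, session id, and generation settings below are illustrative and not part of this commit:

import asyncio

from lmdeploy import GenerationConfig
from lmdeploy.serve.async_engine import AsyncEngine

engine = AsyncEngine('Qwen/Qwen3-8B')  # illustrative model path

async def main():
    messages = [{'role': 'user', 'content': 'hi'}]
    # Preferred form after this commit: template flags travel in chat_template_kwargs
    # instead of the deprecated top-level enable_thinking argument.
    async for out in engine.generate(messages,
                                     session_id=0,
                                     gen_config=GenerationConfig(max_new_tokens=128),
                                     chat_template_kwargs={'enable_thinking': False}):
        print(out.response, end='')

asyncio.run(main())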

lmdeploy/serve/openai/api_server.py

Lines changed: 12 additions & 5 deletions
@@ -457,6 +457,15 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque
         tools = [item.function.model_dump() for item in request.tools]
     # text completion for string input
     do_preprocess = False if isinstance(request.messages, str) else request.do_preprocess
+    chat_template_kwargs = request.chat_template_kwargs or {}
+    if request.enable_thinking is not None:
+        logger.warning('`enable_thinking` will be deprecated in the future, '
+                       'please use `chat_template_kwargs` instead.')
+        if chat_template_kwargs.get('enable_thinking') is None:
+            chat_template_kwargs['enable_thinking'] = request.enable_thinking
+        else:
+            logger.warning('`enable_thinking` in `chat_template_kwargs` will override the value in request.')
+    enable_thinking = chat_template_kwargs.get('enable_thinking', None)
     result_generator = VariableInterface.async_engine.generate(
         request.messages,
         request.session_id,
@@ -468,8 +477,7 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque
         sequence_end=True,
         do_preprocess=do_preprocess,
         adapter_name=adapter_name,
-        enable_thinking=request.enable_thinking,
-        add_vision_id=request.add_vision_id,
+        chat_template_kwargs=chat_template_kwargs or None,
         mm_processor_kwargs=request.mm_processor_kwargs)

     def create_stream_response_json(index: int,
@@ -543,8 +551,7 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
             elif (request.tool_choice != 'none' and request.tools is not None
                   and VariableInterface.tool_parser is None):
                 logger.error('Please launch the api_server with --tool-call-parser if you want to use tool.')
-
-            if VariableInterface.reasoning_parser is not None and request.enable_thinking is not False:
+            if VariableInterface.reasoning_parser is not None and enable_thinking is not False:
                 reasoning_delta = VariableInterface.reasoning_parser.extract_reasoning_content_streaming(
                     previous_text=previous_text,
                     current_text=current_text,
@@ -617,7 +624,7 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
     elif request.tool_choice != 'none' and request.tools is not None and VariableInterface.tool_parser is None:
         logger.error('Please launch the api_server with --tool-call-parser if you want to use tool.')

-    if VariableInterface.reasoning_parser is not None and request.enable_thinking is not False:
+    if VariableInterface.reasoning_parser is not None and enable_thinking is not False:
         reasoning_content, text = VariableInterface.reasoning_parser.extract_reasoning_content(text, request)

     message = ChatMessage(role='assistant',
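
From a client's point of view, chat_template_kwargs is just an extra JSON field on /v1/chat/completions; the server merges it with the deprecated enable_thinking field before calling generate, and the reasoning parser keys off the merged value. A sketch with the official openai client, where base_url, api_key, and the served model name are placeholders for your own deployment:

from openai import OpenAI

client = OpenAI(base_url='http://0.0.0.0:23333/v1', api_key='dummy')  # default api_server port

resp = client.chat.completions.create(
    model=client.models.list().data[0].id,
    messages=[{'role': 'user', 'content': 'hi'}],
    # chat_template_kwargs is not part of the OpenAI schema, so it rides in extra_body.
    extra_body={'chat_template_kwargs': {'enable_thinking': False}},
)
print(resp.choices[0].message.content)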

lmdeploy/serve/openai/protocol.py

Lines changed: 6 additions & 2 deletions
@@ -149,10 +149,14 @@ class ChatCompletionRequest(BaseModel):
     seed: Optional[int] = None
     min_new_tokens: Optional[int] = Field(default=None, examples=[None])
     min_p: float = 0.0
-    enable_thinking: Optional[bool] = None
-    add_vision_id: Optional[bool] = False
+    enable_thinking: Optional[bool] = None  # will be deprecated in the future
     return_token_ids: Optional[bool] = False
     include_stop_str_in_output: Optional[bool] = False
+    chat_template_kwargs: dict[str, Any] | None = Field(
+        default=None,
+        description=('Additional keyword args to pass to the template renderer. '
+                     'Will be accessible by the chat template.'),
+    )
     # kwargs for hf processor
     mm_processor_kwargs: Optional[dict[str, Any]] = Field(
         default=None,
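
Because the new field is typed dict[str, Any] | None, any JSON object passes validation and is handed to the chat template as-is. A quick schema sketch; the model name and the keys are illustrative, and which keys a template actually honors depends on the model:

from lmdeploy.serve.openai.protocol import ChatCompletionRequest

req = ChatCompletionRequest(
    model='internlm/Intern-S1',  # illustrative
    messages=[{'role': 'user', 'content': 'hi'}],
    chat_template_kwargs={'enable_thinking': False, 'add_vision_id': True},
)
# Pydantic keeps the dict untouched; downstream code decides what each key means.
print(req.chat_template_kwargs)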

lmdeploy/serve/vl_async_engine.py

Lines changed: 5 additions & 10 deletions
@@ -55,8 +55,7 @@ async def _get_prompt_input(self,
                                 sequence_start: bool,
                                 adapter_name: str,
                                 tools: Optional[List[object]] = None,
-                                enable_thinking: Optional[bool] = None,
-                                add_vision_id: Optional[bool] = False,
+                                chat_template_kwargs: Optional[Dict] = None,
                                 mm_processor_kwargs: Optional[Dict[str, Any]] = None,
                                 **kwargs):
         """Process messages and return the required data for the inference
@@ -71,8 +70,7 @@ async def _get_prompt_input(self,
                 sequence_start,
                 adapter_name,
                 tools=tools,
-                enable_thinking=enable_thinking,
-                add_vision_id=add_vision_id,
+                chat_template_kwargs=chat_template_kwargs,
                 **kwargs)
         elif isinstance(messages, List):
             has_multimodal_input = any(
@@ -84,8 +82,7 @@ async def _get_prompt_input(self,
                 sequence_start,
                 adapter_name,
                 tools,
-                enable_thinking=enable_thinking,
-                add_vision_id=add_vision_id,
+                chat_template_kwargs=chat_template_kwargs,
                 **kwargs)
         else:
             raise RuntimeError(f'unsupported messages {messages}')
@@ -105,8 +102,7 @@ async def _get_prompt_input(self,
                 self.tokenizer,
                 sequence_start,
                 tools=tools,
-                enable_thinking=enable_thinking,
-                add_vision_id=add_vision_id)
+                chat_template_kwargs=chat_template_kwargs)
         elif self.backend == 'pytorch':
             # for pt engine, this module only conduct the image preprocessing
             # It leaves the vision embedding to the pt engine
@@ -115,8 +111,7 @@ async def _get_prompt_input(self,
                 self.tokenizer,
                 sequence_start,
                 tools=tools,
-                enable_thinking=enable_thinking,
-                add_vision_id=add_vision_id)
+                chat_template_kwargs=chat_template_kwargs)
         return results

     @classmethod

lmdeploy/vl/engine.py

Lines changed: 4 additions & 8 deletions
@@ -80,8 +80,7 @@ async def wrap_for_pytorch(
         tokenizer,
         sequence_start,
         tools: Optional[List[object]] = None,
-        enable_thinking: Optional[bool] = None,
-        add_vision_id: Optional[bool] = False,
+        chat_template_kwargs: Optional[Dict] = None,
     ) -> List[Dict]:
         """
         Args:
@@ -106,8 +105,7 @@ async def wrap_for_pytorch(
                 tokenizer,
                 sequence_start,
                 tools=tools,
-                enable_thinking=enable_thinking,
-                add_vision_id=add_vision_id)
+                chat_template_kwargs=chat_template_kwargs)
         else:
             result = self.model.to_pytorch_with_input_ids(messages)
         # clear data
@@ -123,8 +121,7 @@ async def wrap_for_turbomind(
         tokenizer,
         sequence_start,
         tools: Optional[List[object]] = None,
-        enable_thinking: Optional[bool] = None,
-        add_vision_id: Optional[bool] = False,
+        chat_template_kwargs: Optional[Dict] = None,
     ) -> Dict:
         """
         Args:
@@ -145,8 +142,7 @@ async def wrap_for_turbomind(
                 tokenizer,
                 sequence_start,
                 tools=tools,
-                enable_thinking=enable_thinking,
-                add_vision_id=add_vision_id)
+                chat_template_kwargs=chat_template_kwargs)
         # clear data
         for i, message in enumerate(messages):
             if isinstance(message['content'], List):

lmdeploy/vl/model/base.py

Lines changed: 6 additions & 2 deletions
@@ -132,7 +132,7 @@ def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]:
         if self.backend == 'turbomind':
             raise NotImplementedError()

-    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start, **kwargs):
+    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start, chat_template_kwargs=None, **kwargs):
         """Pack the preprocessing results in a format compatible with what is
         required by pytorch engine. ONLY implement it when the backend is
         pytorch engine.
@@ -142,11 +142,13 @@ def to_pytorch(self, messages, chat_template, tokenizer, sequence_start, **kwarg
             chat_template: the chat template defined in `lmdeploy/model.py`
             tokenzer: the tokenizer model
             sequence_start: starting flag of a sequence
+            chat_template_kwargs: additional arguments for chat template
+                processing, such as `add_vision_id` and `enable_thinking`
         """
         if self.backend == 'pytorch':
             raise NotImplementedError()

-    def to_turbomind(self, messages, chat_template, tokenizer, sequence_start, **kwargs):
+    def to_turbomind(self, messages, chat_template, tokenizer, sequence_start, chat_template_kwargs=None, **kwargs):
         """Pack the forwarding results in a format compatible with what is
         required by turbomind engine. ONLY implement it when the backend is
         turbomind engine.
@@ -156,6 +158,8 @@ def to_turbomind(self, messages, chat_template, tokenizer, sequence_start, **kwa
             chat_template: the chat template defined in `lmdeploy/model.py`
             tokenzer: the tokenizer model
             sequence_start: starting flag of a sequence
+            chat_template_kwargs: additional arguments for chat template
+                processing, such as `add_vision_id` and `enable_thinking`
         """
         if self.backend == 'turbomind':
             raise NotImplementedError()
