From 75ec504c85e22792e1b72e4195fb324865a32dbd Mon Sep 17 00:00:00 2001
From: R0CKSTAR
Date: Tue, 6 Jan 2026 19:29:11 +0800
Subject: [PATCH 1/3] Support MThreads (MUSA) GPU (#1162)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR adds support for the Moore Threads (MUSA) GPU platform, expanding LightLLM's hardware compatibility.

*NOTE:*
1. `_fwd_kernel_token_att1` has been slightly updated (the `dtype` argument of `tl.sum` is replaced by an explicit cast to `tl.float32`) to ensure compatibility with the Triton version in use.
2. `has_mtlink` will be used in upcoming enhancements to enable multi-GPU support.
3. `torch` / `torch_musa` need to be upgraded to the latest versions.

### Testing Done

```bash
root@worker3218:/ws# python -m lightllm.server.api_server --model_dir /home/dist/Qwen3-0.6B/ --disable_cudagraph --host 0.0.0.0
WARNING 01-02 12:22:47 [sgl_utils.py:29] sgl_kernel is not installed, or the installed version did not support fa3. Try to upgrade it.
WARNING 01-02 12:22:47 [light_utils.py:13] lightllm_kernel is not installed, you can't use the api of it.
INFO 01-02 12:22:48 [__init__.py:36] Available plugins for group vllm.platform_plugins:
INFO 01-02 12:22:48 [__init__.py:38] - musa -> vllm_musa:register
INFO 01-02 12:22:48 [__init__.py:41] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load.
INFO 01-02 12:22:48 [__init__.py:232] Platform plugin musa is activated
WARNING 01-02 12:22:48 [vllm_utils.py:18] vllm is not installed, you can't use the api of it. You can solve it by running `pip install vllm`.
INFO 01-02 12:22:48 [communication_op.py:57] deep_ep is not installed, you can't use the api of it.
INFO 01-02 12:22:48 [cache_tensor_manager.py:17] USE_GPU_TENSOR_CACHE is On
WARNING 01-02 12:22:48 [grouped_fused_moe_ep.py:28] no deepep or deep_gemm
WARNING 01-02 12:22:48 [nixl_kv_transporter.py:19] nixl is not installed, which is required for pd disagreggation!!!
INFO 01-02 12:22:48 [shm_size_check.py:21] SHM check: Available=500.00 GB,Recommended=2.32 GB.Sufficient: True INFO 01-02 12:22:48 [api_start.py:94] zmq mode head: ipc:///tmp/_28765_0_ INFO 01-02 12:22:48 [api_start.py:96] use tgi api: False INFO 01-02 12:22:48 [api_start.py:233] alloced ports: [10105, 10128, 10009, 10002, 10268, 10173, 10255, 10190, 10225, 10305] INFO 01-02 12:22:48 [api_start.py:284] all start args:Namespace(run_mode='normal', host='0.0.0.0', port=8000, httpserver_workers=1, zmq_mode='ipc:///tmp/_28765_0_', pd_master_ip='0.0.0.0', pd_master_port=1212, pd_decode_rpyc_port=42000, select_p_d_node_strategy='round_robin', config_server_host=None, config_server_port=None, nixl_pd_kv_page_num=16, nixl_pd_kv_page_size=1024, model_name='default_model_name', model_dir='/home/dist/Qwen3-0.6B/', tokenizer_mode='fast', load_way='HF', max_total_token_num=None, mem_fraction=0.9, batch_max_tokens=8448, eos_id=[151645], tool_call_parser=None, reasoning_parser=None, chat_template=None, running_max_req_size=1000, nnodes=1, node_rank=0, multinode_httpmanager_port=12345, multinode_router_gloo_port=20001, tp=1, dp=1, dp_balancer='bs_balancer', max_req_total_len=16384, nccl_host='127.0.0.1', nccl_port=28765, use_config_server_to_init_nccl=False, mode=[], trust_remote_code=False, disable_log_stats=False, log_stats_interval=10, disable_shm_warning=False, router_token_ratio=0.0, router_max_new_token_len=1024, router_max_wait_tokens=1, disable_aggressive_schedule=False, use_dynamic_prompt_cache=False, disable_dynamic_prompt_cache=False, chunked_prefill_size=4096, disable_chunked_prefill=False, diverse_mode=False, token_healing_mode=False, output_constraint_mode='none', first_token_constraint_mode=False, enable_multimodal=False, enable_multimodal_audio=False, enable_mps=False, disable_custom_allreduce=False, enable_custom_allgather=False, enable_tpsp_mix_mode=False, enable_dp_prefill_balance=False, enable_prefill_microbatch_overlap=False, enable_decode_microbatch_overlap=False, enable_flashinfer_prefill=False, enable_flashinfer_decode=False, enable_fa3=False, cache_capacity=200, embed_cache_storage_size=4, data_type='bfloat16', return_all_prompt_logprobs=False, use_reward_model=False, long_truncation_mode=None, use_tgi_api=False, health_monitor=False, metric_gateway=None, job_name='lightllm', grouping_key=[], push_interval=10, visual_infer_batch_size=1, visual_send_batch_size=1, visual_gpu_ids=[0], visual_tp=1, visual_dp=1, visual_nccl_ports=[29500], enable_monitor_auth=False, disable_cudagraph=True, enable_prefill_cudagraph=False, prefll_cudagraph_max_handle_token=512, graph_max_batch_size=256, graph_split_batch_size=32, graph_grow_step_size=16, graph_max_len_in_batch=16384, quant_type='none', quant_cfg=None, vit_quant_type='none', vit_quant_cfg=None, sampling_backend='triton', penalty_counter_mode='gpu_counter', ep_redundancy_expert_config_path=None, auto_update_redundancy_expert=False, enable_fused_shared_experts=False, mtp_mode=None, mtp_draft_model_dir=None, mtp_step=0, kv_quant_calibration_config_path=None, schedule_time_interval=0.03, enable_cpu_cache=False, cpu_cache_storage_size=2, cpu_cache_token_page_size=256, enable_disk_cache=False, disk_cache_storage_size=10, disk_cache_dir=None, enable_dp_prompt_cache_fetch=False, router_port=10105, detokenization_port=10128, http_server_port=10009, visual_port=10002, audio_port=10268, cache_port=10173, metric_port=10255, multi_level_kv_cache_port=10190, pd_node_infer_rpyc_ports=[10305], pd_node_id=294623010895931863621527973304373176200, 
pd_p_allowed_port_min=20000, pd_p_allowed_port_max=30000) WARNING 01-02 12:22:55 [sgl_utils.py:29] sgl_kernel is not installed, or the installed version did not support fa3. Try to upgrade it. WARNING 01-02 12:22:55 [light_utils.py:13] lightllm_kernel is not installed, you can't use the api of it. INFO 01-02 12:22:55 [__init__.py:36] Available plugins for group vllm.platform_plugins: INFO 01-02 12:22:55 [__init__.py:38] - musa -> vllm_musa:register INFO 01-02 12:22:55 [__init__.py:41] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. INFO 01-02 12:22:55 [__init__.py:232] Platform plugin musa is activated WARNING 01-02 12:22:55 [vllm_utils.py:18] vllm is not installed, you can't use the api of it. You can solve it by running `pip install vllm`. INFO 01-02 12:22:55 [communication_op.py:57] deep_ep is not installed, you can't use the api of it. 2026-01-02 12:22:55 | server | 140684395422848 | INFO : server started on [0.0.0.0]:10255 INFO 01-02 12:22:55 [start_utils.py:37] init func start_metric_manager : init ok WARNING 01-02 12:23:02 [sgl_utils.py:29] sgl_kernel is not installed, or the installed version did not support fa3. Try to upgrade it. WARNING 01-02 12:23:02 [light_utils.py:13] lightllm_kernel is not installed, you can't use the api of it. WARNING 01-02 12:23:02 [sgl_utils.py:29] sgl_kernel is not installed, or the installed version did not support fa3. Try to upgrade it. WARNING 01-02 12:23:02 [light_utils.py:13] lightllm_kernel is not installed, you can't use the api of it. INFO 01-02 12:23:02 [__init__.py:36] Available plugins for group vllm.platform_plugins: INFO 01-02 12:23:02 [__init__.py:38] - musa -> vllm_musa:register INFO 01-02 12:23:02 [__init__.py:41] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. INFO 01-02 12:23:02 [__init__.py:232] Platform plugin musa is activated WARNING 01-02 12:23:02 [vllm_utils.py:18] vllm is not installed, you can't use the api of it. You can solve it by running `pip install vllm`. INFO 01-02 12:23:02 [communication_op.py:57] deep_ep is not installed, you can't use the api of it. INFO 01-02 12:23:02 [cache_tensor_manager.py:17] USE_GPU_TENSOR_CACHE is On INFO 01-02 12:23:02 [__init__.py:36] Available plugins for group vllm.platform_plugins: INFO 01-02 12:23:02 [__init__.py:38] - musa -> vllm_musa:register INFO 01-02 12:23:02 [__init__.py:41] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. INFO 01-02 12:23:02 [__init__.py:232] Platform plugin musa is activated WARNING 01-02 12:23:02 [vllm_utils.py:18] vllm is not installed, you can't use the api of it. You can solve it by running `pip install vllm`. INFO 01-02 12:23:02 [communication_op.py:57] deep_ep is not installed, you can't use the api of it. WARNING 01-02 12:23:02 [grouped_fused_moe_ep.py:28] no deepep or deep_gemm INFO 01-02 12:23:02 [cache_tensor_manager.py:17] USE_GPU_TENSOR_CACHE is On WARNING 01-02 12:23:03 [grouped_fused_moe_ep.py:28] no deepep or deep_gemm INFO 01-02 12:23:03 [manager.py:36] pub_to_httpserver sendhwm 1000 WARNING 01-02 12:23:03 [nixl_kv_transporter.py:19] nixl is not installed, which is required for pd disagreggation!!! 
2026-01-02 12:23:03 | server | 140684395422848 | INFO : accepted ('127.0.0.1', 36414) with fd 25 2026-01-02 12:23:03 | server | 140653235951168 | INFO : welcome ('127.0.0.1', 36414) INFO 01-02 12:23:08 [cache_tensor_manager.py:17] USE_GPU_TENSOR_CACHE is On WARNING 01-02 12:23:09 [sgl_utils.py:29] sgl_kernel is not installed, or the installed version did not support fa3. Try to upgrade it. INFO 01-02 12:23:10 [__init__.py:36] Available plugins for group vllm.platform_plugins: INFO 01-02 12:23:10 [__init__.py:38] - musa -> vllm_musa:register INFO 01-02 12:23:10 [__init__.py:41] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. INFO 01-02 12:23:10 [__init__.py:232] Platform plugin musa is activated WARNING 01-02 12:23:10 [vllm_utils.py:18] vllm is not installed, you can't use the api of it. You can solve it by running `pip install vllm`. WARNING 01-02 12:23:10 [light_utils.py:13] lightllm_kernel is not installed, you can't use the api of it. WARNING 01-02 12:23:10 [grouped_fused_moe_ep.py:28] no deepep or deep_gemm INFO 01-02 12:23:10 [communication_op.py:57] deep_ep is not installed, you can't use the api of it. WARNING 01-02 12:23:10 [nixl_kv_transporter.py:19] nixl is not installed, which is required for pd disagreggation!!! INFO 01-02 12:23:10 [model_rpc.py:67] Initialized RPC server for rank 0. INFO 01-02 12:23:10 [model_rpc.py:168] use ChunkedPrefillBackend INFO 01-02 12:23:11 [basemodel.py:157] Initial quantization. The default quantization method is none pid 39235 Loading model weights with 1 workers: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1.01it/s] INFO 01-02 12:23:12 [mem_utils.py:37] mode setting params: [] INFO 01-02 12:23:12 [mem_utils.py:57] Model kv cache using mode normal INFO 01-02 12:23:12 [mem_manager.py:84] 69.38735313415528 GB space is available after load the model weight INFO 01-02 12:23:12 [mem_manager.py:84] 0.109375 MB is the size of one token kv cache INFO 01-02 12:23:12 [mem_manager.py:84] 649624 is the profiled max_total_token_num with the mem_fraction 0.9 INFO 01-02 12:23:12 [mem_manager.py:84] warming up: 0%| | 0/12 [00:00 INFO 01-02 12:23:45 [manager.py:196] use req queue ChunkedPrefillQueue INFO 01-02 12:23:45 [start_utils.py:37] init func start_router_process : init ok INFO 01-02 12:23:45 [start_utils.py:37] init func start_detokenization_process : init ok INFO 01-02 12:23:45 [api_start.py:58] start process pid 30307 INFO 01-02 12:23:45 [api_start.py:59] http server pid 54746 [2026-01-02 12:23:45 +0800] [54746] [INFO] Starting gunicorn 23.0.0 [2026-01-02 12:23:45 +0800] [54746] [INFO] Listening at: http://0.0.0.0:8000 (54746) [2026-01-02 12:23:45 +0800] [54746] [INFO] Using worker: uvicorn.workers.UvicornWorker [2026-01-02 12:23:45 +0800] [54966] [INFO] Booting worker with pid: 54966 WARNING 01-02 12:23:51 [sgl_utils.py:29] sgl_kernel is not installed, or the installed version did not support fa3. Try to upgrade it. WARNING 01-02 12:23:51 [light_utils.py:13] lightllm_kernel is not installed, you can't use the api of it. INFO 01-02 12:23:52 [__init__.py:36] Available plugins for group vllm.platform_plugins: INFO 01-02 12:23:52 [__init__.py:38] - musa -> vllm_musa:register INFO 01-02 12:23:52 [__init__.py:41] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
INFO 01-02 12:23:52 [__init__.py:232] Platform plugin musa is activated WARNING 01-02 12:23:52 [vllm_utils.py:18] vllm is not installed, you can't use the api of it. You can solve it by running `pip install vllm`. INFO 01-02 12:23:52 [communication_op.py:57] deep_ep is not installed, you can't use the api of it. INFO 01-02 12:23:52 [cache_tensor_manager.py:17] USE_GPU_TENSOR_CACHE is On WARNING 01-02 12:23:52 [grouped_fused_moe_ep.py:28] no deepep or deep_gemm [2026-01-02 12:23:52 +0800] [54966] [INFO] Started server process [54966] [2026-01-02 12:23:52 +0800] [54966] [INFO] Waiting for application startup. INFO 01-02 12:23:52 [api_http.py:359] server start up 2026-01-02 12:23:53 | server | 140684395422848 | INFO : accepted ('127.0.0.1', 55128) with fd 26 2026-01-02 12:23:53 | server | 140653227558464 | INFO : welcome ('127.0.0.1', 55128) 2026-01-02 12:23:53 | server | 140684395422848 | INFO : accepted ('127.0.0.1', 55144) with fd 27 2026-01-02 12:23:53 | server | 140653219165760 | INFO : welcome ('127.0.0.1', 55144) INFO 01-02 12:23:54 [req_id_generator.py:34] ReqIDGenerator init finished INFO 01-02 12:23:54 [api_http.py:363] server start up ok, loop use is [2026-01-02 12:23:54 +0800] [54966] [INFO] Application startup complete. INFO 01-02 12:23:58 [manager.py:417] recieved req X-Request-Id: X-Session-Id: start_time:2026-01-02 12:23:58 lightllm_req_id:8 INFO 01-02 12:23:58 [manager.py:424] router recive req id 8 cost time 0.05271601676940918 s DEBUG 01-02 12:23:58 [manager.py:322] Prefill Batch: batch_id=-1, time:1767327838.6764812s req_ids:[8] DEBUG 01-02 12:23:58 [manager.py:322] INFO 01-02 12:23:58 [manager.py:55] detokenization recv req id 8 cost time 0.0744318962097168 s INFO 01-02 12:23:59 [manager.py:163] detoken release req id 8 INFO 01-02 12:23:59 [manager.py:611] X-Request-Id: X-Session-Id: start_time:2026-01-02 12:23:58 lightllm_req_id:8 first_token_cost:409.63053703308105ms total_cost_time:907.1474075317383ms,out_token_counter:17 mean_per_token_cost_time: 29.265698264626895ms prompt_token_num:4 gpu cache hit: False gpu_prompt_cache_len:0 gpu_prompt_cache_ratio:0.0 cpu cache hit: False cpu_prompt_cache_len:0 cpu_prompt_cache_ratio:0.0 disk cache hit: False disk_prompt_cache_len:0 disk_prompt_cache_ratio:0.0 mtp_avg_token_per_step:1.0 127.0.0.1:38158 - "POST /generate HTTP/1.1" 200 DEBUG 01-02 12:23:59 [req_manager.py:78] freed all request size 1008 DEBUG 01-02 12:23:59 [infer_batch.py:172] free a batch state: DEBUG 01-02 12:23:59 [infer_batch.py:172] radix refed token num 0 DEBUG 01-02 12:23:59 [infer_batch.py:172] radix hold token num 21 DEBUG 01-02 12:23:59 [infer_batch.py:172] mem manager can alloc token num 649603 DEBUG 01-02 12:23:59 [infer_batch.py:172] mem manager total size 649624 INFO 01-02 12:23:59 [batch.py:56] router release req id 8 INFO 01-02 12:23:59 [shm_req_manager.py:111] all shm req has been release ok ``` Signed-off-by: Xiaodong Ye --- lightllm/__init__.py | 4 +++ .../token_attention_nopad_att1.py | 3 +- lightllm/utils/device_utils.py | 31 +++++++++++++++---- 3 files changed, 31 insertions(+), 7 deletions(-) diff --git a/lightllm/__init__.py b/lightllm/__init__.py index e69de29bb..e9ba6f304 100644 --- a/lightllm/__init__.py +++ b/lightllm/__init__.py @@ -0,0 +1,4 @@ +from lightllm.utils.device_utils import is_musa + +if is_musa(): + import torchada # noqa: F401 diff --git a/lightllm/models/llama/triton_kernel/token_attention_nopad_att1.py b/lightllm/models/llama/triton_kernel/token_attention_nopad_att1.py index eb5af6fec..45de83e98 100644 --- 
a/lightllm/models/llama/triton_kernel/token_attention_nopad_att1.py +++ b/lightllm/models/llama/triton_kernel/token_attention_nopad_att1.py @@ -60,7 +60,8 @@ def _fwd_kernel_token_att1( ).to(tl.int64) off_k = k_loc[:, None] * stride_kbs + cur_kv_head * stride_kh + offs_d[None, :] * stride_kd k = tl.load(K + off_k, mask=offs_n_new[:, None] < cur_batch_end_index, other=0.0) - att_value = tl.sum(q[None, :] * k, 1, dtype=tl.float32) + att_value = tl.sum(q[None, :] * k, 1) + att_value = att_value.to(tl.float32) att_value *= sm_scale off_o = cur_head * att_stride_h + (cur_batch_in_all_start_index + offs_n) * att_stride_bs tl.store(Att_Out + off_o, att_value, mask=offs_n_new < cur_batch_end_index) diff --git a/lightllm/utils/device_utils.py b/lightllm/utils/device_utils.py index cd48a355b..09d7a680f 100644 --- a/lightllm/utils/device_utils.py +++ b/lightllm/utils/device_utils.py @@ -81,11 +81,14 @@ def calcu_kernel_best_vsm_count(kernel, num_warps): return num_sm +@lru_cache(maxsize=1) +def is_musa(): + return hasattr(torch.version, "musa") and torch.version.musa is not None + + @lru_cache(maxsize=None) def get_current_device_name(): - import torch - - if torch.cuda.is_available(): + if torch.cuda.is_available() or is_musa(): device = torch.cuda.current_device() gpu_name = torch.cuda.get_device_name(device) # 4090 trans to 4090 D @@ -103,8 +106,6 @@ def init_p2p(device_index): """ torch 调用跨卡的to操作后,triton编译的算子便能自动操作跨卡tensor。 """ - import torch - num_gpus = torch.cuda.device_count() tensor = torch.zeros((1,)) tensor = tensor.to(f"cuda:{device_index}") @@ -127,8 +128,26 @@ def has_nvlink(): result = result.decode("utf-8") # Check if the output contains 'NVLink' return any(f"NV{i}" in result for i in range(1, 8)) + except FileNotFoundError: + # nvidia-smi is not installed, assume no NVLink + return False + except subprocess.CalledProcessError: + # If there's an error while executing nvidia-smi, assume no NVLink + return False + + +def has_mtlink(): + try: + # Call mthreads-gmi to get the topology matrix + result = subprocess.check_output(["mthreads-gmi", "topo", "--matrix"]) + result = result.decode("utf-8") + # Check if the output contains 'MTLink' + return any(f"MT{i}" in result for i in range(1, 8)) + except FileNotFoundError: + # mthreads-gmi is not installed, assume no MTLink + return False except subprocess.CalledProcessError: - # If there's an error (e.g., nvidia-smi is not installed or another issue), assume no NVLink + # If there's an error while executing mthreads-gmi, assume no MTLink return False From e96b3d440d7fac4405b69d9f39a9ee4851ee254c Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Thu, 8 Jan 2026 11:21:58 +0000 Subject: [PATCH 2/3] update openai_api --- lightllm/server/api_models.py | 16 +++++++++++++--- lightllm/server/api_openai.py | 3 ++- lightllm/server/build_prompt.py | 6 +++++- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/lightllm/server/api_models.py b/lightllm/server/api_models.py index 7b9cdd501..df684d4dd 100644 --- a/lightllm/server/api_models.py +++ b/lightllm/server/api_models.py @@ -1,7 +1,7 @@ import time import uuid -from pydantic import BaseModel, Field, field_validator, model_validator +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator from typing import Any, Dict, List, Optional, Union, Literal, ClassVar from transformers import GenerationConfig @@ -21,6 +21,14 @@ class Message(BaseModel): content: Union[str, List[MessageContent]] +class CharacterMessage(BaseModel): + """Message format for 
character-based chat, where role is inferred from name.""" + + name: str + content: Union[str, List[MessageContent]] + role: Optional[str] = None # Optional, can be inferred from role_setting + + class Function(BaseModel): """Function descriptions.""" @@ -105,7 +113,7 @@ def _normalize_role(cls, v): raise ValueError("'role' must be a string") -ChatCompletionMessageParam = Union[ChatCompletionMessageGenericParam, Message] +ChatCompletionMessageParam = Union[ChatCompletionMessageGenericParam, Message, CharacterMessage] class CompletionRequest(BaseModel): @@ -176,6 +184,8 @@ def apply_loaded_defaults(cls, data: Any): class ChatCompletionRequest(BaseModel): + model_config = ConfigDict(populate_by_name=True) + model: str messages: List[ChatCompletionMessageParam] function_call: Optional[str] = "none" @@ -216,7 +226,7 @@ class ChatCompletionRequest(BaseModel): top_k: Optional[int] = -1 repetition_penalty: Optional[float] = 1.0 ignore_eos: Optional[bool] = False - role_settings: Optional[Dict[str, str]] = None + role_settings: Optional[Dict[str, str]] = Field(default=None, alias="role_setting") character_settings: Optional[List[Dict[str, str]]] = None # Class variables to store loaded default values diff --git a/lightllm/server/api_openai.py b/lightllm/server/api_openai.py index 6a8c232dc..cdac4ab5c 100644 --- a/lightllm/server/api_openai.py +++ b/lightllm/server/api_openai.py @@ -105,7 +105,8 @@ def _get_history_tool_calls_cnt(request: ChatCompletionRequest) -> int: messages = getattr(request, "messages", []) idx = 0 for msg in messages: - if msg.role == "assistant": + role = getattr(msg, "role", None) + if role == "assistant": tool_calls = getattr(msg, "tool_calls", None) idx += len(list(tool_calls)) if tool_calls is not None else 0 # noqa return idx diff --git a/lightllm/server/build_prompt.py b/lightllm/server/build_prompt.py index f770459a5..131ee76b4 100644 --- a/lightllm/server/build_prompt.py +++ b/lightllm/server/build_prompt.py @@ -17,7 +17,11 @@ async def build_prompt(request, tools) -> str: global tokenizer # pydantic格式转成dict, 否则,当根据tokenizer_config.json拼template时,Jinja判断无法识别 messages = [m.model_dump(by_alias=True, exclude_none=True) for m in request.messages] - kwargs = {"conversation": messages} + kwargs = { + "conversation": messages, + # 假设 request 对象里有这个字段,或者你想传空 + "system_instruction": getattr(request, "system_instruction", ""), + } if request.character_settings: kwargs["character_settings"] = request.character_settings if request.role_settings: From 40382c89dd86a55b9d56ea212afce9e64203f793 Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Thu, 8 Jan 2026 11:58:40 +0000 Subject: [PATCH 3/3] fix --- lightllm/server/api_models.py | 1 + lightllm/server/build_prompt.py | 7 ++----- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/lightllm/server/api_models.py b/lightllm/server/api_models.py index df684d4dd..de6d6ba20 100644 --- a/lightllm/server/api_models.py +++ b/lightllm/server/api_models.py @@ -228,6 +228,7 @@ class ChatCompletionRequest(BaseModel): ignore_eos: Optional[bool] = False role_settings: Optional[Dict[str, str]] = Field(default=None, alias="role_setting") character_settings: Optional[List[Dict[str, str]]] = None + system_instruction: Optional[str] = None # Class variables to store loaded default values _loaded_defaults: ClassVar[Dict[str, Any]] = {} diff --git a/lightllm/server/build_prompt.py b/lightllm/server/build_prompt.py index 131ee76b4..cff2ec127 100644 --- a/lightllm/server/build_prompt.py +++ b/lightllm/server/build_prompt.py @@ 
-17,15 +17,12 @@ async def build_prompt(request, tools) -> str: global tokenizer # pydantic格式转成dict, 否则,当根据tokenizer_config.json拼template时,Jinja判断无法识别 messages = [m.model_dump(by_alias=True, exclude_none=True) for m in request.messages] - kwargs = { - "conversation": messages, - # 假设 request 对象里有这个字段,或者你想传空 - "system_instruction": getattr(request, "system_instruction", ""), - } + kwargs = {"conversation": messages} if request.character_settings: kwargs["character_settings"] = request.character_settings if request.role_settings: kwargs["role_setting"] = request.role_settings + kwargs["system_instruction"] = request.system_instruction if request.chat_template_kwargs: kwargs.update(request.chat_template_kwargs)
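
For reference, here is a minimal usage sketch of the platform-detection helpers that PATCH 1/3 adds to `lightllm/utils/device_utils.py` (`is_musa`, `has_mtlink`) alongside the existing `has_nvlink`. The `has_fast_interconnect` wrapper and the `__main__` check are hypothetical illustrations of how the upcoming multi-GPU work might consume these helpers; they are not part of the patch.

```python
# Hypothetical sketch, not part of the patch: the wrapper name
# has_fast_interconnect is invented here for illustration only.
from lightllm.utils.device_utils import has_mtlink, has_nvlink, is_musa


def has_fast_interconnect() -> bool:
    """Report whether the local GPUs expose a vendor interconnect."""
    if is_musa():
        # MUSA build of torch: has_mtlink() shells out to
        # `mthreads-gmi topo --matrix` and looks for MT1..MT7 entries.
        return has_mtlink()
    # CUDA build: has_nvlink() performs the analogous nvidia-smi check.
    return has_nvlink()


if __name__ == "__main__":
    print(f"is_musa={is_musa()} fast_interconnect={has_fast_interconnect()}")
```

Note that `is_musa()` only inspects `torch.version.musa`, so it can be called before any device is initialized.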
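
Similarly, a minimal request sketch for the new `ChatCompletionRequest` fields introduced in PATCH 2/3 and 3/3 (`role_setting`, `character_settings`, `system_instruction`) and for name-only `CharacterMessage` entries. The host/port and model name follow the test log above; the `/v1/chat/completions` path and the concrete role/character values are assumptions for illustration, and the exact semantics of each field depend on the model's chat template.

```python
# Hypothetical request sketch, not part of the patches. Beyond the declared
# types (Dict[str, str], List[Dict[str, str]], str), the field semantics depend
# on the chat template; the values below are placeholders.
import requests

payload = {
    "model": "default_model_name",
    "messages": [
        # Standard OpenAI-style message.
        {"role": "user", "content": "Hello!"},
        # CharacterMessage: `role` may be omitted and inferred from role_setting.
        {"name": "Alice", "content": "Nice to meet you."},
    ],
    # Forwarded to the chat template as `role_setting` (alias of role_settings).
    "role_setting": {"Alice": "assistant"},
    # Forwarded to the chat template as `character_settings`.
    "character_settings": [{"name": "Alice", "description": "a friendly guide"}],
    # New optional field forwarded to the chat template as `system_instruction`.
    "system_instruction": "Answer concisely.",
}

resp = requests.post("http://127.0.0.1:8000/v1/chat/completions", json=payload, timeout=60)
print(resp.json())
```

Because `ChatCompletionRequest` now sets `populate_by_name=True`, clients may send either the alias `role_setting` (as above) or the field name `role_settings`.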