From 75ec504c85e22792e1b72e4195fb324865a32dbd Mon Sep 17 00:00:00 2001
From: R0CKSTAR
Date: Tue, 6 Jan 2026 19:29:11 +0800
Subject: [PATCH 1/3] Support MThreads (MUSA) GPU (#1162)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR adds support for the Moore Threads (MUSA) GPU platform, expanding LightLLM's hardware compatibility.

*NOTE:*
1. `_fwd_kernel_token_att1` has been slightly updated (the `dtype` argument of `tl.sum` is replaced by an explicit cast to `tl.float32`) to ensure compatibility with the Triton version in use.
2. `has_mtlink` will be used in upcoming enhancements to enable multi-GPU support.
3. `torch` / `torch_musa` need to be upgraded to the latest versions.

### Testing Done

```bash
root@worker3218:/ws# python -m lightllm.server.api_server --model_dir /home/dist/Qwen3-0.6B/ --disable_cudagraph --host 0.0.0.0
WARNING 01-02 12:22:47 [sgl_utils.py:29] sgl_kernel is not installed, or the installed version did not support fa3. Try to upgrade it.
WARNING 01-02 12:22:47 [light_utils.py:13] lightllm_kernel is not installed, you can't use the api of it.
INFO 01-02 12:22:48 [__init__.py:36] Available plugins for group vllm.platform_plugins:
INFO 01-02 12:22:48 [__init__.py:38] - musa -> vllm_musa:register
INFO 01-02 12:22:48 [__init__.py:41] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load.
INFO 01-02 12:22:48 [__init__.py:232] Platform plugin musa is activated
WARNING 01-02 12:22:48 [vllm_utils.py:18] vllm is not installed, you can't use the api of it. You can solve it by running `pip install vllm`.
INFO 01-02 12:22:48 [communication_op.py:57] deep_ep is not installed, you can't use the api of it.
INFO 01-02 12:22:48 [cache_tensor_manager.py:17] USE_GPU_TENSOR_CACHE is On
WARNING 01-02 12:22:48 [grouped_fused_moe_ep.py:28] no deepep or deep_gemm
WARNING 01-02 12:22:48 [nixl_kv_transporter.py:19] nixl is not installed, which is required for pd disagreggation!!!
INFO 01-02 12:22:48 [shm_size_check.py:21] SHM check: Available=500.00 GB,Recommended=2.32 GB.Sufficient: True INFO 01-02 12:22:48 [api_start.py:94] zmq mode head: ipc:///tmp/_28765_0_ INFO 01-02 12:22:48 [api_start.py:96] use tgi api: False INFO 01-02 12:22:48 [api_start.py:233] alloced ports: [10105, 10128, 10009, 10002, 10268, 10173, 10255, 10190, 10225, 10305] INFO 01-02 12:22:48 [api_start.py:284] all start args:Namespace(run_mode='normal', host='0.0.0.0', port=8000, httpserver_workers=1, zmq_mode='ipc:///tmp/_28765_0_', pd_master_ip='0.0.0.0', pd_master_port=1212, pd_decode_rpyc_port=42000, select_p_d_node_strategy='round_robin', config_server_host=None, config_server_port=None, nixl_pd_kv_page_num=16, nixl_pd_kv_page_size=1024, model_name='default_model_name', model_dir='/home/dist/Qwen3-0.6B/', tokenizer_mode='fast', load_way='HF', max_total_token_num=None, mem_fraction=0.9, batch_max_tokens=8448, eos_id=[151645], tool_call_parser=None, reasoning_parser=None, chat_template=None, running_max_req_size=1000, nnodes=1, node_rank=0, multinode_httpmanager_port=12345, multinode_router_gloo_port=20001, tp=1, dp=1, dp_balancer='bs_balancer', max_req_total_len=16384, nccl_host='127.0.0.1', nccl_port=28765, use_config_server_to_init_nccl=False, mode=[], trust_remote_code=False, disable_log_stats=False, log_stats_interval=10, disable_shm_warning=False, router_token_ratio=0.0, router_max_new_token_len=1024, router_max_wait_tokens=1, disable_aggressive_schedule=False, use_dynamic_prompt_cache=False, disable_dynamic_prompt_cache=False, chunked_prefill_size=4096, disable_chunked_prefill=False, diverse_mode=False, token_healing_mode=False, output_constraint_mode='none', first_token_constraint_mode=False, enable_multimodal=False, enable_multimodal_audio=False, enable_mps=False, disable_custom_allreduce=False, enable_custom_allgather=False, enable_tpsp_mix_mode=False, enable_dp_prefill_balance=False, enable_prefill_microbatch_overlap=False, enable_decode_microbatch_overlap=False, enable_flashinfer_prefill=False, enable_flashinfer_decode=False, enable_fa3=False, cache_capacity=200, embed_cache_storage_size=4, data_type='bfloat16', return_all_prompt_logprobs=False, use_reward_model=False, long_truncation_mode=None, use_tgi_api=False, health_monitor=False, metric_gateway=None, job_name='lightllm', grouping_key=[], push_interval=10, visual_infer_batch_size=1, visual_send_batch_size=1, visual_gpu_ids=[0], visual_tp=1, visual_dp=1, visual_nccl_ports=[29500], enable_monitor_auth=False, disable_cudagraph=True, enable_prefill_cudagraph=False, prefll_cudagraph_max_handle_token=512, graph_max_batch_size=256, graph_split_batch_size=32, graph_grow_step_size=16, graph_max_len_in_batch=16384, quant_type='none', quant_cfg=None, vit_quant_type='none', vit_quant_cfg=None, sampling_backend='triton', penalty_counter_mode='gpu_counter', ep_redundancy_expert_config_path=None, auto_update_redundancy_expert=False, enable_fused_shared_experts=False, mtp_mode=None, mtp_draft_model_dir=None, mtp_step=0, kv_quant_calibration_config_path=None, schedule_time_interval=0.03, enable_cpu_cache=False, cpu_cache_storage_size=2, cpu_cache_token_page_size=256, enable_disk_cache=False, disk_cache_storage_size=10, disk_cache_dir=None, enable_dp_prompt_cache_fetch=False, router_port=10105, detokenization_port=10128, http_server_port=10009, visual_port=10002, audio_port=10268, cache_port=10173, metric_port=10255, multi_level_kv_cache_port=10190, pd_node_infer_rpyc_ports=[10305], pd_node_id=294623010895931863621527973304373176200, 
pd_p_allowed_port_min=20000, pd_p_allowed_port_max=30000) WARNING 01-02 12:22:55 [sgl_utils.py:29] sgl_kernel is not installed, or the installed version did not support fa3. Try to upgrade it. WARNING 01-02 12:22:55 [light_utils.py:13] lightllm_kernel is not installed, you can't use the api of it. INFO 01-02 12:22:55 [__init__.py:36] Available plugins for group vllm.platform_plugins: INFO 01-02 12:22:55 [__init__.py:38] - musa -> vllm_musa:register INFO 01-02 12:22:55 [__init__.py:41] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. INFO 01-02 12:22:55 [__init__.py:232] Platform plugin musa is activated WARNING 01-02 12:22:55 [vllm_utils.py:18] vllm is not installed, you can't use the api of it. You can solve it by running `pip install vllm`. INFO 01-02 12:22:55 [communication_op.py:57] deep_ep is not installed, you can't use the api of it. 2026-01-02 12:22:55 | server | 140684395422848 | INFO : server started on [0.0.0.0]:10255 INFO 01-02 12:22:55 [start_utils.py:37] init func start_metric_manager : init ok WARNING 01-02 12:23:02 [sgl_utils.py:29] sgl_kernel is not installed, or the installed version did not support fa3. Try to upgrade it. WARNING 01-02 12:23:02 [light_utils.py:13] lightllm_kernel is not installed, you can't use the api of it. WARNING 01-02 12:23:02 [sgl_utils.py:29] sgl_kernel is not installed, or the installed version did not support fa3. Try to upgrade it. WARNING 01-02 12:23:02 [light_utils.py:13] lightllm_kernel is not installed, you can't use the api of it. INFO 01-02 12:23:02 [__init__.py:36] Available plugins for group vllm.platform_plugins: INFO 01-02 12:23:02 [__init__.py:38] - musa -> vllm_musa:register INFO 01-02 12:23:02 [__init__.py:41] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. INFO 01-02 12:23:02 [__init__.py:232] Platform plugin musa is activated WARNING 01-02 12:23:02 [vllm_utils.py:18] vllm is not installed, you can't use the api of it. You can solve it by running `pip install vllm`. INFO 01-02 12:23:02 [communication_op.py:57] deep_ep is not installed, you can't use the api of it. INFO 01-02 12:23:02 [cache_tensor_manager.py:17] USE_GPU_TENSOR_CACHE is On INFO 01-02 12:23:02 [__init__.py:36] Available plugins for group vllm.platform_plugins: INFO 01-02 12:23:02 [__init__.py:38] - musa -> vllm_musa:register INFO 01-02 12:23:02 [__init__.py:41] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. INFO 01-02 12:23:02 [__init__.py:232] Platform plugin musa is activated WARNING 01-02 12:23:02 [vllm_utils.py:18] vllm is not installed, you can't use the api of it. You can solve it by running `pip install vllm`. INFO 01-02 12:23:02 [communication_op.py:57] deep_ep is not installed, you can't use the api of it. WARNING 01-02 12:23:02 [grouped_fused_moe_ep.py:28] no deepep or deep_gemm INFO 01-02 12:23:02 [cache_tensor_manager.py:17] USE_GPU_TENSOR_CACHE is On WARNING 01-02 12:23:03 [grouped_fused_moe_ep.py:28] no deepep or deep_gemm INFO 01-02 12:23:03 [manager.py:36] pub_to_httpserver sendhwm 1000 WARNING 01-02 12:23:03 [nixl_kv_transporter.py:19] nixl is not installed, which is required for pd disagreggation!!! 
2026-01-02 12:23:03 | server | 140684395422848 | INFO : accepted ('127.0.0.1', 36414) with fd 25 2026-01-02 12:23:03 | server | 140653235951168 | INFO : welcome ('127.0.0.1', 36414) INFO 01-02 12:23:08 [cache_tensor_manager.py:17] USE_GPU_TENSOR_CACHE is On WARNING 01-02 12:23:09 [sgl_utils.py:29] sgl_kernel is not installed, or the installed version did not support fa3. Try to upgrade it. INFO 01-02 12:23:10 [__init__.py:36] Available plugins for group vllm.platform_plugins: INFO 01-02 12:23:10 [__init__.py:38] - musa -> vllm_musa:register INFO 01-02 12:23:10 [__init__.py:41] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. INFO 01-02 12:23:10 [__init__.py:232] Platform plugin musa is activated WARNING 01-02 12:23:10 [vllm_utils.py:18] vllm is not installed, you can't use the api of it. You can solve it by running `pip install vllm`. WARNING 01-02 12:23:10 [light_utils.py:13] lightllm_kernel is not installed, you can't use the api of it. WARNING 01-02 12:23:10 [grouped_fused_moe_ep.py:28] no deepep or deep_gemm INFO 01-02 12:23:10 [communication_op.py:57] deep_ep is not installed, you can't use the api of it. WARNING 01-02 12:23:10 [nixl_kv_transporter.py:19] nixl is not installed, which is required for pd disagreggation!!! INFO 01-02 12:23:10 [model_rpc.py:67] Initialized RPC server for rank 0. INFO 01-02 12:23:10 [model_rpc.py:168] use ChunkedPrefillBackend INFO 01-02 12:23:11 [basemodel.py:157] Initial quantization. The default quantization method is none pid 39235 Loading model weights with 1 workers: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1.01it/s] INFO 01-02 12:23:12 [mem_utils.py:37] mode setting params: [] INFO 01-02 12:23:12 [mem_utils.py:57] Model kv cache using mode normal INFO 01-02 12:23:12 [mem_manager.py:84] 69.38735313415528 GB space is available after load the model weight INFO 01-02 12:23:12 [mem_manager.py:84] 0.109375 MB is the size of one token kv cache INFO 01-02 12:23:12 [mem_manager.py:84] 649624 is the profiled max_total_token_num with the mem_fraction 0.9 INFO 01-02 12:23:12 [mem_manager.py:84] warming up: 0%| | 0/12 [00:00 INFO 01-02 12:23:45 [manager.py:196] use req queue ChunkedPrefillQueue INFO 01-02 12:23:45 [start_utils.py:37] init func start_router_process : init ok INFO 01-02 12:23:45 [start_utils.py:37] init func start_detokenization_process : init ok INFO 01-02 12:23:45 [api_start.py:58] start process pid 30307 INFO 01-02 12:23:45 [api_start.py:59] http server pid 54746 [2026-01-02 12:23:45 +0800] [54746] [INFO] Starting gunicorn 23.0.0 [2026-01-02 12:23:45 +0800] [54746] [INFO] Listening at: http://0.0.0.0:8000 (54746) [2026-01-02 12:23:45 +0800] [54746] [INFO] Using worker: uvicorn.workers.UvicornWorker [2026-01-02 12:23:45 +0800] [54966] [INFO] Booting worker with pid: 54966 WARNING 01-02 12:23:51 [sgl_utils.py:29] sgl_kernel is not installed, or the installed version did not support fa3. Try to upgrade it. WARNING 01-02 12:23:51 [light_utils.py:13] lightllm_kernel is not installed, you can't use the api of it. INFO 01-02 12:23:52 [__init__.py:36] Available plugins for group vllm.platform_plugins: INFO 01-02 12:23:52 [__init__.py:38] - musa -> vllm_musa:register INFO 01-02 12:23:52 [__init__.py:41] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
INFO 01-02 12:23:52 [__init__.py:232] Platform plugin musa is activated WARNING 01-02 12:23:52 [vllm_utils.py:18] vllm is not installed, you can't use the api of it. You can solve it by running `pip install vllm`. INFO 01-02 12:23:52 [communication_op.py:57] deep_ep is not installed, you can't use the api of it. INFO 01-02 12:23:52 [cache_tensor_manager.py:17] USE_GPU_TENSOR_CACHE is On WARNING 01-02 12:23:52 [grouped_fused_moe_ep.py:28] no deepep or deep_gemm [2026-01-02 12:23:52 +0800] [54966] [INFO] Started server process [54966] [2026-01-02 12:23:52 +0800] [54966] [INFO] Waiting for application startup. INFO 01-02 12:23:52 [api_http.py:359] server start up 2026-01-02 12:23:53 | server | 140684395422848 | INFO : accepted ('127.0.0.1', 55128) with fd 26 2026-01-02 12:23:53 | server | 140653227558464 | INFO : welcome ('127.0.0.1', 55128) 2026-01-02 12:23:53 | server | 140684395422848 | INFO : accepted ('127.0.0.1', 55144) with fd 27 2026-01-02 12:23:53 | server | 140653219165760 | INFO : welcome ('127.0.0.1', 55144) INFO 01-02 12:23:54 [req_id_generator.py:34] ReqIDGenerator init finished INFO 01-02 12:23:54 [api_http.py:363] server start up ok, loop use is [2026-01-02 12:23:54 +0800] [54966] [INFO] Application startup complete. INFO 01-02 12:23:58 [manager.py:417] recieved req X-Request-Id: X-Session-Id: start_time:2026-01-02 12:23:58 lightllm_req_id:8 INFO 01-02 12:23:58 [manager.py:424] router recive req id 8 cost time 0.05271601676940918 s DEBUG 01-02 12:23:58 [manager.py:322] Prefill Batch: batch_id=-1, time:1767327838.6764812s req_ids:[8] DEBUG 01-02 12:23:58 [manager.py:322] INFO 01-02 12:23:58 [manager.py:55] detokenization recv req id 8 cost time 0.0744318962097168 s INFO 01-02 12:23:59 [manager.py:163] detoken release req id 8 INFO 01-02 12:23:59 [manager.py:611] X-Request-Id: X-Session-Id: start_time:2026-01-02 12:23:58 lightllm_req_id:8 first_token_cost:409.63053703308105ms total_cost_time:907.1474075317383ms,out_token_counter:17 mean_per_token_cost_time: 29.265698264626895ms prompt_token_num:4 gpu cache hit: False gpu_prompt_cache_len:0 gpu_prompt_cache_ratio:0.0 cpu cache hit: False cpu_prompt_cache_len:0 cpu_prompt_cache_ratio:0.0 disk cache hit: False disk_prompt_cache_len:0 disk_prompt_cache_ratio:0.0 mtp_avg_token_per_step:1.0 127.0.0.1:38158 - "POST /generate HTTP/1.1" 200 DEBUG 01-02 12:23:59 [req_manager.py:78] freed all request size 1008 DEBUG 01-02 12:23:59 [infer_batch.py:172] free a batch state: DEBUG 01-02 12:23:59 [infer_batch.py:172] radix refed token num 0 DEBUG 01-02 12:23:59 [infer_batch.py:172] radix hold token num 21 DEBUG 01-02 12:23:59 [infer_batch.py:172] mem manager can alloc token num 649603 DEBUG 01-02 12:23:59 [infer_batch.py:172] mem manager total size 649624 INFO 01-02 12:23:59 [batch.py:56] router release req id 8 INFO 01-02 12:23:59 [shm_req_manager.py:111] all shm req has been release ok ``` Signed-off-by: Xiaodong Ye --- lightllm/__init__.py | 4 +++ .../token_attention_nopad_att1.py | 3 +- lightllm/utils/device_utils.py | 31 +++++++++++++++---- 3 files changed, 31 insertions(+), 7 deletions(-) diff --git a/lightllm/__init__.py b/lightllm/__init__.py index e69de29bb..e9ba6f304 100644 --- a/lightllm/__init__.py +++ b/lightllm/__init__.py @@ -0,0 +1,4 @@ +from lightllm.utils.device_utils import is_musa + +if is_musa(): + import torchada # noqa: F401 diff --git a/lightllm/models/llama/triton_kernel/token_attention_nopad_att1.py b/lightllm/models/llama/triton_kernel/token_attention_nopad_att1.py index eb5af6fec..45de83e98 100644 --- 
a/lightllm/models/llama/triton_kernel/token_attention_nopad_att1.py +++ b/lightllm/models/llama/triton_kernel/token_attention_nopad_att1.py @@ -60,7 +60,8 @@ def _fwd_kernel_token_att1( ).to(tl.int64) off_k = k_loc[:, None] * stride_kbs + cur_kv_head * stride_kh + offs_d[None, :] * stride_kd k = tl.load(K + off_k, mask=offs_n_new[:, None] < cur_batch_end_index, other=0.0) - att_value = tl.sum(q[None, :] * k, 1, dtype=tl.float32) + att_value = tl.sum(q[None, :] * k, 1) + att_value = att_value.to(tl.float32) att_value *= sm_scale off_o = cur_head * att_stride_h + (cur_batch_in_all_start_index + offs_n) * att_stride_bs tl.store(Att_Out + off_o, att_value, mask=offs_n_new < cur_batch_end_index) diff --git a/lightllm/utils/device_utils.py b/lightllm/utils/device_utils.py index cd48a355b..09d7a680f 100644 --- a/lightllm/utils/device_utils.py +++ b/lightllm/utils/device_utils.py @@ -81,11 +81,14 @@ def calcu_kernel_best_vsm_count(kernel, num_warps): return num_sm +@lru_cache(maxsize=1) +def is_musa(): + return hasattr(torch.version, "musa") and torch.version.musa is not None + + @lru_cache(maxsize=None) def get_current_device_name(): - import torch - - if torch.cuda.is_available(): + if torch.cuda.is_available() or is_musa(): device = torch.cuda.current_device() gpu_name = torch.cuda.get_device_name(device) # 4090 trans to 4090 D @@ -103,8 +106,6 @@ def init_p2p(device_index): """ torch 调用跨卡的to操作后,triton编译的算子便能自动操作跨卡tensor。 """ - import torch - num_gpus = torch.cuda.device_count() tensor = torch.zeros((1,)) tensor = tensor.to(f"cuda:{device_index}") @@ -127,8 +128,26 @@ def has_nvlink(): result = result.decode("utf-8") # Check if the output contains 'NVLink' return any(f"NV{i}" in result for i in range(1, 8)) + except FileNotFoundError: + # nvidia-smi is not installed, assume no NVLink + return False + except subprocess.CalledProcessError: + # If there's an error while executing nvidia-smi, assume no NVLink + return False + + +def has_mtlink(): + try: + # Call mthreads-gmi to get the topology matrix + result = subprocess.check_output(["mthreads-gmi", "topo", "--matrix"]) + result = result.decode("utf-8") + # Check if the output contains 'MTLink' + return any(f"MT{i}" in result for i in range(1, 8)) + except FileNotFoundError: + # mthreads-gmi is not installed, assume no MTLink + return False except subprocess.CalledProcessError: - # If there's an error (e.g., nvidia-smi is not installed or another issue), assume no NVLink + # If there's an error while executing mthreads-gmi, assume no MTLink return False From e96b3d440d7fac4405b69d9f39a9ee4851ee254c Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Thu, 8 Jan 2026 11:21:58 +0000 Subject: [PATCH 2/3] update openai_api --- lightllm/server/api_models.py | 16 +++++++++++++--- lightllm/server/api_openai.py | 3 ++- lightllm/server/build_prompt.py | 6 +++++- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/lightllm/server/api_models.py b/lightllm/server/api_models.py index 7b9cdd501..df684d4dd 100644 --- a/lightllm/server/api_models.py +++ b/lightllm/server/api_models.py @@ -1,7 +1,7 @@ import time import uuid -from pydantic import BaseModel, Field, field_validator, model_validator +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator from typing import Any, Dict, List, Optional, Union, Literal, ClassVar from transformers import GenerationConfig @@ -21,6 +21,14 @@ class Message(BaseModel): content: Union[str, List[MessageContent]] +class CharacterMessage(BaseModel): + """Message format for 
character-based chat, where role is inferred from name.""" + + name: str + content: Union[str, List[MessageContent]] + role: Optional[str] = None # Optional, can be inferred from role_setting + + class Function(BaseModel): """Function descriptions.""" @@ -105,7 +113,7 @@ def _normalize_role(cls, v): raise ValueError("'role' must be a string") -ChatCompletionMessageParam = Union[ChatCompletionMessageGenericParam, Message] +ChatCompletionMessageParam = Union[ChatCompletionMessageGenericParam, Message, CharacterMessage] class CompletionRequest(BaseModel): @@ -176,6 +184,8 @@ def apply_loaded_defaults(cls, data: Any): class ChatCompletionRequest(BaseModel): + model_config = ConfigDict(populate_by_name=True) + model: str messages: List[ChatCompletionMessageParam] function_call: Optional[str] = "none" @@ -216,7 +226,7 @@ class ChatCompletionRequest(BaseModel): top_k: Optional[int] = -1 repetition_penalty: Optional[float] = 1.0 ignore_eos: Optional[bool] = False - role_settings: Optional[Dict[str, str]] = None + role_settings: Optional[Dict[str, str]] = Field(default=None, alias="role_setting") character_settings: Optional[List[Dict[str, str]]] = None # Class variables to store loaded default values diff --git a/lightllm/server/api_openai.py b/lightllm/server/api_openai.py index 6a8c232dc..cdac4ab5c 100644 --- a/lightllm/server/api_openai.py +++ b/lightllm/server/api_openai.py @@ -105,7 +105,8 @@ def _get_history_tool_calls_cnt(request: ChatCompletionRequest) -> int: messages = getattr(request, "messages", []) idx = 0 for msg in messages: - if msg.role == "assistant": + role = getattr(msg, "role", None) + if role == "assistant": tool_calls = getattr(msg, "tool_calls", None) idx += len(list(tool_calls)) if tool_calls is not None else 0 # noqa return idx diff --git a/lightllm/server/build_prompt.py b/lightllm/server/build_prompt.py index f770459a5..131ee76b4 100644 --- a/lightllm/server/build_prompt.py +++ b/lightllm/server/build_prompt.py @@ -17,7 +17,11 @@ async def build_prompt(request, tools) -> str: global tokenizer # pydantic格式转成dict, 否则,当根据tokenizer_config.json拼template时,Jinja判断无法识别 messages = [m.model_dump(by_alias=True, exclude_none=True) for m in request.messages] - kwargs = {"conversation": messages} + kwargs = { + "conversation": messages, + # 假设 request 对象里有这个字段,或者你想传空 + "system_instruction": getattr(request, "system_instruction", ""), + } if request.character_settings: kwargs["character_settings"] = request.character_settings if request.role_settings: From 40382c89dd86a55b9d56ea212afce9e64203f793 Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Thu, 8 Jan 2026 11:58:40 +0000 Subject: [PATCH 3/3] fix --- lightllm/server/api_models.py | 1 + lightllm/server/build_prompt.py | 7 ++----- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/lightllm/server/api_models.py b/lightllm/server/api_models.py index df684d4dd..de6d6ba20 100644 --- a/lightllm/server/api_models.py +++ b/lightllm/server/api_models.py @@ -228,6 +228,7 @@ class ChatCompletionRequest(BaseModel): ignore_eos: Optional[bool] = False role_settings: Optional[Dict[str, str]] = Field(default=None, alias="role_setting") character_settings: Optional[List[Dict[str, str]]] = None + system_instruction: Optional[str] = None # Class variables to store loaded default values _loaded_defaults: ClassVar[Dict[str, Any]] = {} diff --git a/lightllm/server/build_prompt.py b/lightllm/server/build_prompt.py index 131ee76b4..cff2ec127 100644 --- a/lightllm/server/build_prompt.py +++ b/lightllm/server/build_prompt.py @@ 
-17,15 +17,12 @@ async def build_prompt(request, tools) -> str: global tokenizer # pydantic格式转成dict, 否则,当根据tokenizer_config.json拼template时,Jinja判断无法识别 messages = [m.model_dump(by_alias=True, exclude_none=True) for m in request.messages] - kwargs = { - "conversation": messages, - # 假设 request 对象里有这个字段,或者你想传空 - "system_instruction": getattr(request, "system_instruction", ""), - } + kwargs = {"conversation": messages} if request.character_settings: kwargs["character_settings"] = request.character_settings if request.role_settings: kwargs["role_setting"] = request.role_settings + kwargs["system_instruction"] = request.system_instruction if request.chat_template_kwargs: kwargs.update(request.chat_template_kwargs)
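
For reference, here is a minimal usage sketch of the platform-detection helpers that PATCH 1/3 adds to `lightllm/utils/device_utils.py` (`is_musa`, `has_mtlink`) alongside the existing `has_nvlink`. The `has_fast_interconnect` wrapper and the `__main__` check are hypothetical illustrations of how the upcoming multi-GPU work might consume these helpers; they are not part of the patch.

```python
# Hypothetical sketch, not part of the patch: the wrapper name
# has_fast_interconnect is invented here for illustration only.
from lightllm.utils.device_utils import has_mtlink, has_nvlink, is_musa


def has_fast_interconnect() -> bool:
    """Report whether the local GPUs expose a vendor interconnect."""
    if is_musa():
        # MUSA build of torch: has_mtlink() shells out to
        # `mthreads-gmi topo --matrix` and looks for MT1..MT7 entries.
        return has_mtlink()
    # CUDA build: has_nvlink() performs the analogous nvidia-smi check.
    return has_nvlink()


if __name__ == "__main__":
    print(f"is_musa={is_musa()} fast_interconnect={has_fast_interconnect()}")
```

Note that `is_musa()` only inspects `torch.version.musa`, so it can be called before any device is initialized.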
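
Similarly, a minimal request sketch for the new `ChatCompletionRequest` fields introduced in PATCH 2/3 and 3/3 (`role_setting`, `character_settings`, `system_instruction`) and for name-only `CharacterMessage` entries. The host/port and model name follow the test log above; the `/v1/chat/completions` path and the concrete role/character values are assumptions for illustration, and the exact semantics of each field depend on the model's chat template.

```python
# Hypothetical request sketch, not part of the patches. Beyond the declared
# types (Dict[str, str], List[Dict[str, str]], str), the field semantics depend
# on the chat template; the values below are placeholders.
import requests

payload = {
    "model": "default_model_name",
    "messages": [
        # Standard OpenAI-style message.
        {"role": "user", "content": "Hello!"},
        # CharacterMessage: `role` may be omitted and inferred from role_setting.
        {"name": "Alice", "content": "Nice to meet you."},
    ],
    # Forwarded to the chat template as `role_setting` (alias of role_settings).
    "role_setting": {"Alice": "assistant"},
    # Forwarded to the chat template as `character_settings`.
    "character_settings": [{"name": "Alice", "description": "a friendly guide"}],
    # New optional field forwarded to the chat template as `system_instruction`.
    "system_instruction": "Answer concisely.",
}

resp = requests.post("http://127.0.0.1:8000/v1/chat/completions", json=payload, timeout=60)
print(resp.json())
```

Because `ChatCompletionRequest` now sets `populate_by_name=True`, clients may send either the alias `role_setting` (as above) or the field name `role_settings`.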