diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 89318004b..084893e11 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -242,7 +242,7 @@ qwen3.5-fp8-mi300x-sglang:
     - { tp: 8, conc-start: 4, conc-end: 64 }
 
 glm5-fp8-mi355x-sglang:
-  image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260219
+  image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260413
   model: zai-org/GLM-5-FP8
   model-prefix: glm5
   runner: mi355x
diff --git a/benchmarks/single_node/glm5_fp8_mi355x.sh b/benchmarks/single_node/glm5_fp8_mi355x.sh
index 3d82fd856..668120e57 100755
--- a/benchmarks/single_node/glm5_fp8_mi355x.sh
+++ b/benchmarks/single_node/glm5_fp8_mi355x.sh
@@ -49,7 +49,9 @@ python3 -m sglang.launch_server \
     --mem-fraction-static 0.85 \
     --model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 8}' \
     --nsa-prefill-backend tilelang \
-    --nsa-decode-backend tilelang $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
+    --nsa-decode-backend tilelang $EVAL_CONTEXT_ARGS \
+    --kv-cache-dtype fp8_e4m3 \
+    --disable-radix-cache > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 6721dbb1e..73f0d86ce 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1353,6 +1353,13 @@
     - "TP2/TP4 seach space exploration for Qwen3.5 fp4 on SGL"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1022
 
+- config-keys:
+    - glm5-fp8-mi355x-sglang
+  description:
+    - "Upgrade GLM5 FP8 MI355X SGLang image to v0.5.10rc0-rocm720-mi35x-20260413"
+    - "Set --kv-cache-dtype fp8_e4m3 and --disable-radix-cache"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1023
+
 - config-keys:
     - qwen3.5-fp8-h200-sglang-mtp
   description: