diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 70a79a273..bb49bb251 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2137,7 +2137,7 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: dsv4-fp4-mi355x-sglang: - image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260610 + image: lmsysorg/sglang-rocm:v0.5.13-rocm720-mi35x-20260612 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: mi355x diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang.sh index 4aeebc683..ade8d96fe 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_sglang.sh @@ -60,7 +60,9 @@ start_gpu_monitor PARALLEL_ARGS=( --tensor-parallel-size "$TP" ) +CHUNKED_PREFILL_SIZE=8192 if [ "${DP_ATTENTION}" = "true" ]; then + CHUNKED_PREFILL_SIZE=$((8192 * TP)) PARALLEL_ARGS+=( --dp "$TP" --enable-dp-attention @@ -85,7 +87,7 @@ sglang serve \ --swa-full-tokens-ratio 0.15 \ --page-size 256 \ --context-length $MAX_MODEL_LEN \ - --chunked-prefill-size 8192 \ + --chunked-prefill-size $CHUNKED_PREFILL_SIZE \ --disable-shared-experts-fusion \ --tool-call-parser deepseekv4 \ --reasoning-parser deepseek-v4 \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 3ebd09db2..2af0baaaa 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3600,6 +3600,14 @@ - "MI355x DSR1-FP4: Include TP4 configurations for 8k1k" - "Expand the TP sweep (included TP=4) for 8k/1k configuration for conc=4 to 64" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1692 + +- config-keys: + - dsv4-fp4-mi355x-sglang + description: + - "Bump image to lmsysorg/sglang-rocm:v0.5.13-rocm720-mi35x-20260612." + - "Fix the intermediate_pad setting in the MoE computation in sglang PR#27858. This avoids the unnecessary overhead of computing useless padding." 
+ - "Correct the --chunked-prefill-size setting under the tp8/dp8 config (8192 * TP when DP attention is enabled)." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1715 - config-keys: - dsv4-fp4-gb200-dynamo-sglang