Commit 7b5575f

[Bug] Fix vLLM config is not set error (#29999)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Parent commit: 77e4472

5 files changed: +47 additions, −27 deletions

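At a glance, the commit threads the layer's ParallelConfig from the MoE layer's vllm_config down into FusedMoEModularKernel at construction time; the kernel caches the DP+EP decision there and only falls back to get_current_vllm_config() when nothing was passed, instead of reading the global config during buffer allocation. The sketch below shows that dataflow with simplified stand-in classes (hypothetical names, not the vLLM types; the real kernel falls back to get_current_vllm_config() rather than defaults):

# Simplified stand-ins (not the vLLM classes) showing the dataflow this
# commit establishes: layer -> factory -> kernel constructor.
from dataclasses import dataclass


@dataclass
class ParallelConfig:
    data_parallel_size: int = 1
    enable_expert_parallel: bool = False


@dataclass
class VllmConfig:
    parallel_config: ParallelConfig


class Kernel:
    def __init__(self, parallel_config: ParallelConfig | None = None):
        # Capture the DP+EP decision once, at construction time.
        pc = parallel_config or ParallelConfig()
        self.is_dp_ep = pc.data_parallel_size > 1 and pc.enable_expert_parallel


class MoELayer:
    def __init__(self, vllm_config: VllmConfig | None = None):
        self.vllm_config = vllm_config


def make_kernel(layer: MoELayer) -> Kernel:
    # Same extraction pattern as the diff: tolerate layers without vllm_config.
    parallel_config = getattr(
        getattr(layer, "vllm_config", None), "parallel_config", None
    )
    return Kernel(parallel_config=parallel_config)


layer = MoELayer(VllmConfig(ParallelConfig(data_parallel_size=2, enable_expert_parallel=True)))
assert make_kernel(layer).is_dp_ep           # config threaded through explicitly
assert not make_kernel(MoELayer()).is_dp_ep  # missing config degrades gracefully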

vllm/model_executor/layers/fused_moe/cutlass_moe.py

Lines changed: 2 additions & 0 deletions
@@ -460,6 +460,7 @@ def cutlass_moe_fp8(
     expert_map: torch.Tensor | None = None,
     apply_router_weight_on_input: bool = False,
     global_num_experts: int = -1,
+    parallel_config=None,
 ) -> torch.Tensor:
     """
     This function computes a a8w8-quantized Mixture of Experts (MoE) layer
@@ -537,6 +538,7 @@ def cutlass_moe_fp8(
             c_strides2=c_strides2,
             quant_config=quant_config,
         ),
+        parallel_config=parallel_config,
     )
 
     return fn(

vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py

Lines changed: 6 additions & 0 deletions
@@ -44,13 +44,19 @@ def make(
         prepare_finalize: FusedMoEPrepareAndFinalize,
         shared_experts: torch.nn.Module | None,
     ) -> "FusedMoEModularMethod":
+        parallel_config = getattr(
+            getattr(moe_layer, "vllm_config", None),
+            "parallel_config",
+            None,
+        )
         return FusedMoEModularMethod(
             old_quant_method,
             FusedMoEModularKernel(
                 prepare_finalize,
                 old_quant_method.select_gemm_impl(prepare_finalize, moe_layer),
                 shared_experts,
                 getattr(moe_layer, "shared_experts_stream", None),
+                parallel_config=parallel_config,
             ),
         )
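The nested getattr above is the defensive-extraction idiom reused at the other call sites in this commit: if the layer has no vllm_config attribute, or has one set to None, the expression collapses to None instead of raising, and FusedMoEModularKernel then falls back to get_current_vllm_config(). A tiny illustration of that behavior (generic objects, not vLLM types):

# None-propagation of the nested getattr used in the diff (illustrative only).
from types import SimpleNamespace

with_config = SimpleNamespace(vllm_config=SimpleNamespace(parallel_config="pc"))
without_attr = SimpleNamespace()              # no vllm_config attribute at all
with_none = SimpleNamespace(vllm_config=None)


def extract(layer):
    return getattr(getattr(layer, "vllm_config", None), "parallel_config", None)


assert extract(with_config) == "pc"
assert extract(without_attr) is None   # missing attribute -> None, no AttributeError
assert extract(with_none) is None      # explicit None -> None as well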

vllm/model_executor/layers/fused_moe/modular_kernel.py

Lines changed: 30 additions & 27 deletions
@@ -10,7 +10,7 @@
 import torch
 
 import vllm.envs as envs
-from vllm.config import get_current_vllm_config
+from vllm.config import ParallelConfig, get_current_vllm_config
 from vllm.forward_context import get_forward_context, is_forward_context_available
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
@@ -716,13 +716,22 @@ def __init__(
         fused_experts: FusedMoEPermuteExpertsUnpermute,
         shared_experts: torch.nn.Module | None = None,
         shared_experts_stream: torch.cuda.Stream | None = None,
+        parallel_config: ParallelConfig | None = None,
     ):
         super().__init__()
         self.prepare_finalize = prepare_finalize
         self.fused_experts = fused_experts
         self.shared_experts = shared_experts
         self.shared_experts_stream = shared_experts_stream
 
+        # cache whether this worker is using DP+EP
+        if parallel_config is None:
+            parallel_config = get_current_vllm_config().parallel_config
+        self.is_dp_ep = (
+            parallel_config.data_parallel_size > 1
+            and parallel_config.enable_expert_parallel
+        )
+
         self._post_init_setup()
         assert (
             prepare_finalize.activation_format == fused_experts.activation_formats[0]
@@ -811,33 +820,27 @@ def _allocate_buffers(
             is_forward_context_available()
             and get_forward_context().attn_metadata is None
         )
-        if is_profile_run and self.fused_experts.supports_chunking():
-            parallel_config = get_current_vllm_config().parallel_config
-            is_dp_ep = (
-                parallel_config.data_parallel_size > 1
-                and parallel_config.enable_expert_parallel
-            )
-            if is_dp_ep:
-                max_workspace_13, max_workspace_2, max_fused_out_shape = (
-                    self.fused_experts.workspace_shapes(
-                        envs.VLLM_FUSED_MOE_CHUNK_SIZE,
-                        N,
-                        K,
-                        top_k,
-                        global_num_experts,
-                        local_num_experts,
-                        expert_tokens_meta,
-                    )
-                )
-                buffers.workspace13.get(
-                    max_workspace_13, device=device, dtype=workspace_dtype
-                )
-                buffers.workspace2.get(
-                    max_workspace_2, device=device, dtype=workspace_dtype
-                )
-                buffers.fused_out.get(
-                    max_fused_out_shape, device=device, dtype=workspace_dtype
+        if is_profile_run and self.fused_experts.supports_chunking() and self.is_dp_ep:
+            max_workspace_13, max_workspace_2, max_fused_out_shape = (
+                self.fused_experts.workspace_shapes(
+                    envs.VLLM_FUSED_MOE_CHUNK_SIZE,
+                    N,
+                    K,
+                    top_k,
+                    global_num_experts,
+                    local_num_experts,
+                    expert_tokens_meta,
                 )
+            )
+            buffers.workspace13.get(
+                max_workspace_13, device=device, dtype=workspace_dtype
+            )
+            buffers.workspace2.get(
+                max_workspace_2, device=device, dtype=workspace_dtype
+            )
+            buffers.fused_out.get(
+                max_fused_out_shape, device=device, dtype=workspace_dtype
+            )
 
         # Get intermediate workspace shapes based off the chunked M size.
         workspace13_shape, workspace2_shape, _ = self.fused_experts.workspace_shapes(
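This file carries the behavioral fix: FusedMoEModularKernel now resolves the DP+EP decision once in __init__ (preferring an explicitly passed ParallelConfig, falling back to get_current_vllm_config() only there), so _allocate_buffers no longer reads the global config during profiling/forward runs where no config context may be active; that lazy lookup is what raised the "vLLM config is not set" error. Below is a self-contained sketch of the failure mode and the construction-time capture, using stand-in names rather than vLLM's actual config machinery:

# Hypothetical stand-ins illustrating why construction-time capture avoids the
# "config is not set" failure; not vLLM code, just the shape of the problem.
from contextlib import contextmanager
from contextvars import ContextVar
from dataclasses import dataclass


@dataclass
class ParallelSettings:
    data_parallel_size: int = 1
    enable_expert_parallel: bool = False


_current: ContextVar[ParallelSettings | None] = ContextVar("settings", default=None)


@contextmanager
def settings_context(settings: ParallelSettings):
    token = _current.set(settings)
    try:
        yield
    finally:
        _current.reset(token)


def get_current_settings() -> ParallelSettings:
    settings = _current.get()
    if settings is None:
        raise RuntimeError("config is not set")  # analogous failure mode
    return settings


class Kernel:
    def __init__(self, settings: ParallelSettings | None = None):
        # Prefer the explicit argument; fall back to the context only here,
        # while the engine is still inside its config context.
        if settings is None:
            settings = get_current_settings()
        self.is_dp_ep = (
            settings.data_parallel_size > 1 and settings.enable_expert_parallel
        )

    def allocate_buffers(self) -> bool:
        # Hot path: uses the cached flag, never touches the global context.
        return self.is_dp_ep


with settings_context(ParallelSettings(data_parallel_size=2, enable_expert_parallel=True)):
    kernel = Kernel()             # built inside the context: the fallback works

assert kernel.allocate_buffers()  # later calls outside the context still succeed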

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

Lines changed: 3 additions & 0 deletions
@@ -1287,6 +1287,9 @@ def apply(
                 ab_strides2=self.ab_strides2,
                 c_strides1=self.c_strides1,
                 c_strides2=self.ab_strides1_c_strides2,
+                parallel_config=getattr(
+                    getattr(layer, "vllm_config", None), "parallel_config", None
+                ),
             )
 
         else:

vllm/model_executor/layers/quantization/utils/flashinfer_utils.py

Lines changed: 6 additions & 0 deletions
@@ -247,6 +247,11 @@ def flashinfer_cutlass_moe_fp8(
     assert quant_config is not None
 
     # Construct modular kernel with block-scale support when requested.
+    parallel_config = getattr(
+        getattr(layer, "vllm_config", None),
+        "parallel_config",
+        None,
+    )
     fused_experts = mk.FusedMoEModularKernel(
         build_flashinfer_fp8_cutlass_moe_prepare_finalize(
             moe=moe, use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale
@@ -257,6 +262,7 @@
             out_dtype=hidden_states.dtype,
             use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale,
         ),
+        parallel_config=parallel_config,
     )
 
     return fused_experts(
