diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index e786cd8bc7c9..88cecd27ac7c 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -3727,6 +3727,31 @@ def _dummy_run(
         dp_rank = self.parallel_config.data_parallel_rank
         num_tokens_after_padding = int(num_tokens_across_dp[dp_rank])
 
+        # filter out the valid batch descriptor
+        _cg_mode, batch_descriptor = (
+            self.cudagraph_dispatcher.dispatch(
+                BatchDescriptor(
+                    num_tokens=num_tokens_after_padding,
+                    uniform_decode=uniform_decode,
+                    has_lora=activate_lora and self.lora_config is not None,
+                )
+            )
+            if not is_profile
+            else (CUDAGraphMode.NONE, None)
+        )
+        if cudagraph_runtime_mode is not None:
+            # we allow forcing NONE when the dispatcher disagrees to support
+            # warm ups for cudagraph capture
+            assert (
+                cudagraph_runtime_mode == CUDAGraphMode.NONE
+                or cudagraph_runtime_mode == _cg_mode
+            ), (
+                f"Cudagraph runtime mode mismatch at dummy_run. "
+                f"Expected {_cg_mode}, but got {cudagraph_runtime_mode}."
+            )
+        else:
+            cudagraph_runtime_mode = _cg_mode
+
         attn_metadata: PerLayerAttnMetadata | None = None
 
         # If force_attention is True, we always capture attention. Otherwise,
@@ -3801,31 +3826,6 @@ def _dummy_run(
                 num_tokens_after_padding, None, False
             )
 
-        # filter out the valid batch descriptor
-        _cg_mode, batch_descriptor = (
-            self.cudagraph_dispatcher.dispatch(
-                BatchDescriptor(
-                    num_tokens=num_tokens_after_padding,
-                    uniform_decode=uniform_decode,
-                    has_lora=activate_lora and self.lora_config is not None,
-                )
-            )
-            if not is_profile
-            else (CUDAGraphMode.NONE, None)
-        )
-        if cudagraph_runtime_mode is not None:
-            # we allow forcing NONE when the dispatcher disagrees to support
-            # warm ups for cudagraph capture
-            assert (
-                cudagraph_runtime_mode == CUDAGraphMode.NONE
-                or cudagraph_runtime_mode == _cg_mode
-            ), (
-                f"Cudagraph runtime mode mismatch at dummy_run. "
-                f"Expected {_cg_mode}, but got {cudagraph_runtime_mode}."
-            )
-        else:
-            cudagraph_runtime_mode = _cg_mode
-
         if ubatch_slices is not None:
             # Adjust values to reflect a single ubatch.
             # TODO(sage,lucas): this is cruft that should be addressed in