From c6f3f3175282522aa07467fe5213b73b2ec9b013 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Fri, 5 Dec 2025 07:13:58 +0000 Subject: [PATCH 1/3] fix hang Signed-off-by: Lucas Wilkinson --- vllm/v1/worker/gpu_model_runner.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 152bea2c0975..cfecafccbf5b 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -4104,10 +4104,17 @@ def _dummy_run( if self.speculative_config and self.speculative_config.use_eagle(): assert isinstance(self.drafter, EagleProposer) + # since eagle currently only supports PIECEWISE cudagraphs if we are + # capturing cudagraphs only capture for PIECEWISE cudagraphs use + # PIECEWISE cudagraphs if the main model used cudagraphs. + # NOTE(lucas): this is a hack, need to clean up. use_cudagraphs = ( - cudagraph_runtime_mode.has_mode(CUDAGraphMode.PIECEWISE) - and not self.speculative_config.enforce_eager - ) + ( + cudagraph_runtime_mode.has_mode(CUDAGraphMode.PIECEWISE) + and is_graph_capturing + ) + or (cudagraph_runtime_mode != CUDAGraphMode.NONE) + ) and not self.speculative_config.enforce_eager # Note(gnovack) - We need to disable cudagraphs for one of the two # lora cases when cudagraph_specialize_lora is enabled. This is a From 71019e3722149e91af58e4cf29ab9103d859e17f Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Fri, 5 Dec 2025 15:35:25 +0000 Subject: [PATCH 2/3] cleanup Signed-off-by: Lucas Wilkinson --- vllm/v1/worker/gpu_model_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index cfecafccbf5b..442ac5e210b0 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -4110,8 +4110,8 @@ def _dummy_run( # NOTE(lucas): this is a hack, need to clean up. use_cudagraphs = ( ( - cudagraph_runtime_mode.has_mode(CUDAGraphMode.PIECEWISE) - and is_graph_capturing + is_graph_capturing + and cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE ) or (cudagraph_runtime_mode != CUDAGraphMode.NONE) ) and not self.speculative_config.enforce_eager From 3f56bb09be4e3733d1176665caa43b7bb496f6c9 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Fri, 5 Dec 2025 14:14:13 -0500 Subject: [PATCH 3/3] Update vllm/v1/worker/gpu_model_runner.py Co-authored-by: Tyler Michael Smith Signed-off-by: Lucas Wilkinson --- vllm/v1/worker/gpu_model_runner.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 442ac5e210b0..14459485d7d2 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -4104,9 +4104,8 @@ def _dummy_run( if self.speculative_config and self.speculative_config.use_eagle(): assert isinstance(self.drafter, EagleProposer) - # since eagle currently only supports PIECEWISE cudagraphs if we are - # capturing cudagraphs only capture for PIECEWISE cudagraphs use - # PIECEWISE cudagraphs if the main model used cudagraphs. + # Eagle currently only supports PIECEWISE cudagraphs. + # Therefore only use cudagraphs if the main model uses PIECEWISE # NOTE(lucas): this is a hack, need to clean up. use_cudagraphs = ( (