@@ -31,7 +31,6 @@ class CudagraphDispatcher:
     def __init__(self, vllm_config: VllmConfig):
         self.vllm_config = vllm_config
         self.compilation_config = vllm_config.compilation_config
-        self.cudagraph_mode = self.compilation_config.cudagraph_mode
         self.uniform_decode_query_len = (
             1
             if not self.vllm_config.speculative_config
@@ -44,12 +43,8 @@ def __init__(self, vllm_config: VllmConfig):
             CUDAGraphMode.FULL: set(),
         }

-        not_use_piecewise_compilation = (
-            not self.cudagraph_mode.requires_piecewise_compilation()
-        )
-
         assert (
-            not_use_piecewise_compilation
+            not self.compilation_config.cudagraph_mode.requires_piecewise_compilation()
             or self.compilation_config.is_attention_compiled_piecewise()
         ), (
             "Compilation mode should be CompilationMode.VLLM_COMPILE when "
@@ -75,6 +70,7 @@ def _create_padded_batch_descriptor(
             assert num_tokens_padded % uniform_decode_query_len == 0
             assert num_reqs <= max_num_seqs
         else:
+            uniform_decode = False
             num_reqs = min(num_tokens_padded, max_num_seqs)

         return BatchDescriptor(
@@ -95,7 +91,9 @@ def add_cudagraph_key(
     def initialize_cudagraph_keys(
         self, cudagraph_mode: CUDAGraphMode, uniform_decode_query_len: int
     ):
-        # This should be called only after attention backend is initialized.
+        # This should be called only after the attention backend is initialized,
+        # so we get the correct cudagraph mode after backend support is resolved.
+        self.cudagraph_mode = cudagraph_mode

         # LoRA activation cases to specialize the cuda graphs on
         if self.vllm_config.lora_config:
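
Taken together, these hunks defer setting `self.cudagraph_mode` from `__init__` to `initialize_cudagraph_keys`, i.e. until after the attention backend has resolved which modes it actually supports. A minimal sketch of the implied call order, using the diff's `CudagraphDispatcher`; the import paths and the `resolve_mode_with_backend` helper are assumptions, not part of this diff:

```python
from vllm.config import VllmConfig
from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher


def setup_dispatcher(vllm_config: VllmConfig) -> CudagraphDispatcher:
    dispatcher = CudagraphDispatcher(vllm_config)
    # No dispatcher.cudagraph_mode yet: the configured mode may still be
    # downgraded once the attention backend reports what it supports.
    resolved_mode = resolve_mode_with_backend(vllm_config)  # hypothetical helper
    dispatcher.initialize_cudagraph_keys(
        resolved_mode, dispatcher.uniform_decode_query_len
    )
    # Only now does dispatcher.cudagraph_mode hold the backend-resolved mode.
    return dispatcher
```
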