
Commit a81c2e3

fix mamba
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
1 parent 16722b9 commit a81c2e3

2 files changed, +7 -4 lines changed

vllm/v1/attention/backends/mamba_attn.py (2 additions, 0 deletions)

@@ -107,6 +107,8 @@ def _compute_prefix_caching_block_indices(
     )
     # -1 in case it's non-computed and causes later issues with indexing
     block_idx_last_computed_token = block_idx_last_computed_token.clamp(min=0)
+    # -1 in the case we have a padded request (0 seq-len)
+    block_idx_last_scheduled_token = block_idx_last_scheduled_token.clamp(min=0)
 
     return (
         block_idx_last_computed_token,
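
Below is a minimal sketch (not vLLM's actual code; tensor names and the block size are illustrative) of why the new clamp is needed: a padded request has 0 scheduled tokens, so the position of its "last scheduled token" is -1, and floor division propagates that -1 into the block index. Clamping to 0 keeps downstream indexing in-bounds; the padded request's result is discarded anyway.

```python
import torch

block_size = 4  # hypothetical mamba block size

# Scheduled tokens per request; the last request is CUDA-graph padding (0 tokens).
num_scheduled_tokens = torch.tensor([7, 3, 0])

# Position of the last scheduled token in each request: -1 for the padded one.
last_token_pos = num_scheduled_tokens - 1  # tensor([ 6,  2, -1])

# Block holding that token; torch's // is floor division, so -1 // 4 == -1.
block_idx_last_scheduled_token = last_token_pos // block_size  # tensor([ 1,  0, -1])

# Without the clamp, -1 would wrap around to the last entry of a block-table
# row (or trip bounds checks). Clamping pins padded requests to block 0.
block_idx_last_scheduled_token = block_idx_last_scheduled_token.clamp(min=0)
print(block_idx_last_scheduled_token)  # tensor([1, 0, 0])
```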

vllm/v1/worker/gpu_model_runner.py (5 additions, 4 deletions)

@@ -1486,13 +1486,14 @@ def _build_attention_metadata(
             )
         else:
             blk_table = self.input_batch.block_table[kv_cache_gid]
-            # Fill unused with -1. Needed for reshape_and_cache in full cuda
-            # graph mode.
-            blk_table.slot_mapping.gpu[num_tokens:].fill_(-1)
-
             blk_table_tensor = blk_table.get_device_tensor(num_reqs_padded)
             slot_mapping = blk_table.slot_mapping.gpu[:num_tokens_padded]
 
+        # Fill unused with -1. Needed for reshape_and_cache in full cuda
+        # graph mode. `blk_table_tensor` -1 to match mamba PAD_SLOT_ID
+        slot_mapping[num_tokens:num_tokens_padded].fill_(-1)
+        blk_table_tensor[num_reqs:num_reqs_padded].fill_(-1)
+
         common_attn_metadata = CommonAttentionMetadata(
             query_start_loc=query_start_loc,
             query_start_loc_cpu=query_start_loc_cpu,
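
A minimal sketch (again not vLLM's actual code; sizes and names are illustrative) of the padding pattern above: under full CUDA-graph capture, buffers are allocated at padded sizes, so the tail beyond the real tokens and requests must be filled with -1 — the sentinel that lets reshape_and_cache skip those slots and that matches the PAD_SLOT_ID convention the mamba kernels expect.

```python
import torch

num_reqs, num_reqs_padded = 3, 4        # real vs. CUDA-graph-padded request count
num_tokens, num_tokens_padded = 10, 16  # real vs. CUDA-graph-padded token count
max_blocks_per_req = 8                  # hypothetical block-table width

# Persistent buffers sized for the padded batch, as under graph capture.
blk_table_tensor = torch.zeros(num_reqs_padded, max_blocks_per_req, dtype=torch.int32)
slot_mapping = torch.zeros(num_tokens_padded, dtype=torch.int64)

# ... real entries for rows [:num_reqs] and slots [:num_tokens] would be written here ...

# Pad the unused tail: -1 slots so reshape_and_cache writes nothing there,
# and -1 block-table rows to match mamba's PAD_SLOT_ID.
slot_mapping[num_tokens:num_tokens_padded].fill_(-1)
blk_table_tensor[num_reqs:num_reqs_padded].fill_(-1)

assert (slot_mapping[num_tokens:] == -1).all()
assert (blk_table_tensor[num_reqs:] == -1).all()
```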
