
Commit a81c2e3

fix mamba
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
1 parent 16722b9 commit a81c2e3

2 files changed, +7 -4 lines changed

vllm/v1/attention/backends/mamba_attn.py (2 additions, 0 deletions)

@@ -107,6 +107,8 @@ def _compute_prefix_caching_block_indices(
     )
     # -1 in case it's non-computed and causes later issues with indexing
     block_idx_last_computed_token = block_idx_last_computed_token.clamp(min=0)
+    # -1 in the case we have a padded request (0 seq-len)
+    block_idx_last_scheduled_token = block_idx_last_scheduled_token.clamp(min=0)
 
     return (
         block_idx_last_computed_token,
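
Below is a minimal sketch (not vLLM's actual code; tensor names and the block size are illustrative) of why the new clamp is needed: a padded request has 0 scheduled tokens, so the position of its "last scheduled token" is -1, and floor division propagates that -1 into the block index. Clamping to 0 keeps downstream indexing in-bounds; the padded request's result is discarded anyway.

```python
import torch

block_size = 4  # hypothetical mamba block size

# Scheduled tokens per request; the last request is CUDA-graph padding (0 tokens).
num_scheduled_tokens = torch.tensor([7, 3, 0])

# Position of the last scheduled token in each request: -1 for the padded one.
last_token_pos = num_scheduled_tokens - 1  # tensor([ 6,  2, -1])

# Block holding that token; torch's // is floor division, so -1 // 4 == -1.
block_idx_last_scheduled_token = last_token_pos // block_size  # tensor([ 1,  0, -1])

# Without the clamp, -1 would wrap around to the last entry of a block-table
# row (or trip bounds checks). Clamping pins padded requests to block 0.
block_idx_last_scheduled_token = block_idx_last_scheduled_token.clamp(min=0)
print(block_idx_last_scheduled_token)  # tensor([1, 0, 0])
```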

vllm/v1/worker/gpu_model_runner.py (5 additions, 4 deletions)

@@ -1486,13 +1486,14 @@ def _build_attention_metadata(
             )
         else:
             blk_table = self.input_batch.block_table[kv_cache_gid]
-            # Fill unused with -1. Needed for reshape_and_cache in full cuda
-            # graph mode.
-            blk_table.slot_mapping.gpu[num_tokens:].fill_(-1)
-
             blk_table_tensor = blk_table.get_device_tensor(num_reqs_padded)
             slot_mapping = blk_table.slot_mapping.gpu[:num_tokens_padded]
 
+        # Fill unused with -1. Needed for reshape_and_cache in full cuda
+        # graph mode. `blk_table_tensor` -1 to match mamba PAD_SLOT_ID
+        slot_mapping[num_tokens:num_tokens_padded].fill_(-1)
+        blk_table_tensor[num_reqs:num_reqs_padded].fill_(-1)
+
         common_attn_metadata = CommonAttentionMetadata(
             query_start_loc=query_start_loc,
             query_start_loc_cpu=query_start_loc_cpu,
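
A minimal sketch (again not vLLM's actual code; sizes and names are illustrative) of the padding pattern above: under full CUDA-graph capture, buffers are allocated at padded sizes, so the tail beyond the real tokens and requests must be filled with -1 — the sentinel that lets reshape_and_cache skip those slots and that matches the PAD_SLOT_ID convention the mamba kernels expect.

```python
import torch

num_reqs, num_reqs_padded = 3, 4        # real vs. CUDA-graph-padded request count
num_tokens, num_tokens_padded = 10, 16  # real vs. CUDA-graph-padded token count
max_blocks_per_req = 8                  # hypothetical block-table width

# Persistent buffers sized for the padded batch, as under graph capture.
blk_table_tensor = torch.zeros(num_reqs_padded, max_blocks_per_req, dtype=torch.int32)
slot_mapping = torch.zeros(num_tokens_padded, dtype=torch.int64)

# ... real entries for rows [:num_reqs] and slots [:num_tokens] would be written here ...

# Pad the unused tail: -1 slots so reshape_and_cache writes nothing there,
# and -1 block-table rows to match mamba's PAD_SLOT_ID.
slot_mapping[num_tokens:num_tokens_padded].fill_(-1)
blk_table_tensor[num_reqs:num_reqs_padded].fill_(-1)

assert (slot_mapping[num_tokens:] == -1).all()
assert (blk_table_tensor[num_reqs:] == -1).all()
```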
