File tree: 2 files changed (+7, −4 lines changed)

--- Hunk 1 ---
@@ -107,6 +107,8 @@ def _compute_prefix_caching_block_indices(
     )
     # -1 in case it's non-computed and causes later issues with indexing
     block_idx_last_computed_token = block_idx_last_computed_token.clamp(min=0)
+    # -1 in the case we have a padded request (0 seq-len)
+    block_idx_last_scheduled_token = block_idx_last_scheduled_token.clamp(min=0)

     return (
         block_idx_last_computed_token,
--- Hunk 2 ---
@@ -1486,13 +1486,14 @@ def _build_attention_metadata(
         )
     else:
         blk_table = self.input_batch.block_table[kv_cache_gid]
-        # Fill unused with -1. Needed for reshape_and_cache in full cuda
-        # graph mode.
-        blk_table.slot_mapping.gpu[num_tokens:].fill_(-1)
-
         blk_table_tensor = blk_table.get_device_tensor(num_reqs_padded)
         slot_mapping = blk_table.slot_mapping.gpu[:num_tokens_padded]

+        # Fill unused with -1. Needed for reshape_and_cache in full cuda
+        # graph mode. `blk_table_tensor` -1 to match mamba PAD_SLOT_ID
+        slot_mapping[num_tokens:num_tokens_padded].fill_(-1)
+        blk_table_tensor[num_reqs:num_reqs_padded].fill_(-1)
+
     common_attn_metadata = CommonAttentionMetadata(
         query_start_loc=query_start_loc,
         query_start_loc_cpu=query_start_loc_cpu,
You can’t perform that action at this time.
0 commit comments