diff --git a/tests/v1/distributed/test_dbo.py b/tests/v1/distributed/test_dbo.py index 866ae742bf3c..16f154d196ba 100644 --- a/tests/v1/distributed/test_dbo.py +++ b/tests/v1/distributed/test_dbo.py @@ -85,5 +85,4 @@ def test_dbo_dp_ep_gsm8k(all2all_backend: str, num_gpus_available): assert accuracy >= MIN_ACCURACY, ( f"DBO+DP+EP accuracy too low ({all2all_backend}): " f"{accuracy:.3f} < {MIN_ACCURACY:.3f} " - f"(correct: {results['num_correct']}/{results['num_questions']})" ) diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 27f07218d9b2..8edfbb5140bc 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -166,9 +166,7 @@ def _make_metadata_with_slice( assert start_locs[first_req] <= first_tok < start_locs[first_req + 1], ( "Token slice start outside of first request" ) - assert start_locs[last_req] <= last_tok < start_locs[last_req + 1], ( - "Token slice end outside of last request" - ) + # NOTE: last token can be outside of the last request if we have CG padding. # If the "middle" request has tokens in both ubatches, we have to split it. # If ubatch_slice is the first ubatch then we will be splitting the last diff --git a/vllm/v1/worker/dp_utils.py b/vllm/v1/worker/dp_utils.py index c1509de821b0..6539d72d81cb 100644 --- a/vllm/v1/worker/dp_utils.py +++ b/vllm/v1/worker/dp_utils.py @@ -93,13 +93,16 @@ def _post_process_dp_padding(tensor: torch.Tensor, should_dp_pad: bool) -> torch # This just pads the second ubatch slice out to the total number of tokens # (num_tokens + padding) since we do `create_ubatch_slices` before applying DP padding. -def _pad_out_ubatch_slice(ubatch_slices: UBatchSlices, num_total_tokens: int): - padded_second_ubatch_slice = slice( +def _pad_out_ubatch_slice( + ubatch_slices: UBatchSlices, num_total_tokens: int +) -> UBatchSlices: + padded_second_token_slice = slice( ubatch_slices[1].token_slice.start, num_total_tokens ) ubatch_slices[1] = UBatchSlice( - padded_second_ubatch_slice, padded_second_ubatch_slice + ubatch_slices[1].request_slice, padded_second_token_slice ) + return ubatch_slices def _synchronize_dp_ranks(