Skip to content

Commit c3e63f9

Browse files
authored
[Misc] Improve Lora edge cases test coverage (#1830)
Signed-off-by: Jiaxin Shan <seedjeffwan@gmail.com>
1 parent f3be4b9 commit c3e63f9

File tree

4 files changed

+151
-126
lines changed

4 files changed

+151
-126
lines changed

test/integration/engine/lora/README.md

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,22 @@ This test suite validates vLLM's LoRA adapter memory management, including LRU e
77
- Python 3.10+
88
- vLLM installed with LoRA support
99
- GPU with sufficient memory (recommended: 24GB+ for Qwen3-8B)
10-
- Required packages: `pytest`, `pytest-asyncio`, `torch`, `safetensors`, `transformers`, `vllm`
10+
- Required packages: `pytest`, `pytest-asyncio`
11+
- Install vLLM.
1112

1213
```bash
13-
pip install pytest pytest-asyncio torch safetensors transformers vllm
14+
# 1. create env
15+
python3 -m venv ~/venvs/vllm
16+
source ~/venvs/vllm/bin/activate
17+
18+
# 2. Upgrade pip
19+
pip install --upgrade pip
20+
21+
# 3. Install vLLM (and whatever else you need)
22+
pip install "vllm[torch]"==0.12.0
23+
24+
# 4. Install test packages
25+
pip install pytest pytest-asyncio
1426
```
1527

1628
## Quick Start

test/integration/engine/lora/test_api.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ def test_reload_same_lora(self, llm):
117117
# Second load (same LoRA) - should be idempotent
118118
result2 = llm.llm_engine.add_lora(lora_req)
119119

120-
# Document behavior
120+
# Document behavior, both should be True.
121121
print(f"\nFirst add_lora: {result1}")
122122
print(f"Second add_lora (duplicate): {result2}")
123123

@@ -196,6 +196,38 @@ def test_pin_lora_not_found_raises(self, llm):
196196
f"Expected error about LoRA not found, got: {excinfo.value}"
197197
print("PASS: pin_lora raises exception when LoRA not found")
198198

199+
def test_remove_pinned_lora(self, llm):
200+
"""Test that pinned LoRAs can still be explicitly removed.
201+
202+
Note: pin_lora() only prevents eviction due to LRU cache pressure,
203+
it does NOT prevent explicit remove_lora() calls.
204+
"""
205+
# Add and pin a LoRA
206+
lora_req = LoRARequest("pinned_lora", 1, get_lora_path(0))
207+
llm.llm_engine.add_lora(lora_req)
208+
llm.llm_engine.pin_lora(1)
209+
210+
# Verify it's loaded
211+
loras_before = set(llm.llm_engine.list_loras())
212+
assert 1 in loras_before, "LoRA should be loaded"
213+
214+
# Remove the pinned LoRA - this should succeed
215+
remove_result = llm.llm_engine.remove_lora(1)
216+
assert remove_result is True, "remove_lora on pinned LoRA should return True"
217+
218+
# Verify it was removed
219+
loras_after = set(llm.llm_engine.list_loras())
220+
assert 1 not in loras_after, "Pinned LoRA should be removed after remove_lora()"
221+
222+
print(f"\n{'='*60}")
223+
print("REMOVE PINNED LORA")
224+
print(f"{'='*60}")
225+
print("pin_lora() prevents LRU eviction, NOT explicit remove_lora()")
226+
print(f"remove_lora(1) on pinned LoRA: {remove_result}")
227+
print(f"{'='*60}")
228+
229+
print("PASS: Pinned LoRA can be explicitly removed")
230+
199231

200232
if __name__ == "__main__":
201233
pytest.main([__file__, "-v", "-s"])

test/integration/engine/lora/test_batching_concurrency.py

Lines changed: 81 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -493,18 +493,18 @@ class TestConcurrentOperations:
493493
"""
494494

495495
@pytest_asyncio.fixture
496-
async def async_engine_edge_cases(self):
497-
"""Create AsyncLLMEngine for edge case tests.
496+
async def async_engine_single_gpu_slot(self):
497+
"""Create AsyncLLMEngine with max_loras=1 (single GPU slot).
498498
499-
max_loras=2 (GPU slots), max_cpu_loras=8 (CPU cache)
500-
This means only 2 LoRAs can be on GPU at a time.
499+
max_loras=1 (only 1 LoRA on GPU), max_cpu_loras=8 (CPU cache)
500+
This forces LoRA swapping when switching between LoRAs.
501501
"""
502502
cleanup_gpu()
503503
engine_args = AsyncEngineArgs(
504504
model=BASE_MODEL,
505505
enable_lora=True,
506-
max_loras=2, # Only 2 LoRAs on GPU at a time
507-
max_cpu_loras=8, # 8 LoRAs can be cached on CPU
506+
max_loras=1, # Only 1 LoRA on GPU at a time
507+
max_cpu_loras=8, # LoRAs can be cached on CPU
508508
max_lora_rank=16,
509509
gpu_memory_utilization=0.8,
510510
trust_remote_code=True,
@@ -520,115 +520,133 @@ async def async_engine_edge_cases(self):
520520
cleanup_gpu()
521521

522522
@pytest.mark.asyncio
523-
async def test_pin_lora_during_inflight_request(self, async_engine_edge_cases):
524-
"""Test pinning a CPU-resident LoRA while another LoRA has in-flight requests.
525-
526-
Scenario:
527-
1. Load LoRA1 and LoRA2 (both on GPU, max_loras=2)
528-
2. Load LoRA3 (goes to CPU since GPU is full)
523+
async def test_pin_cpu_lora_during_inflight_request(self, async_engine_single_gpu_slot):
524+
"""Test pinning a CPU-resident LoRA while GPU LoRA has in-flight request.
525+
note: we only test the case that all gpu loras are busy with in-flight requests,
526+
to make test simple, we use --max-loras=1
527+
528+
Scenario (max_loras=1):
529+
1. Load LoRA1 (goes to GPU - the only slot)
530+
2. Load LoRA2 (goes to CPU since GPU is full)
529531
3. Start an in-flight request using LoRA1
530-
4. While request is running, try to pin LoRA3 (on CPU)
531-
5. Document what happens: Does it block? Fail? Succeed?
532+
4. While request is running, pin LoRA2 (on CPU)
532533
533-
Key question: Does pin_lora execute DURING the request or wait until AFTER?
534+
Key question: Does pin(lora2) wait for request to finish,
535+
or does it preempt/terminate the in-flight request?
534536
"""
535-
engine = async_engine_edge_cases
537+
engine = async_engine_single_gpu_slot
536538

537-
# Load 3 LoRAs - first 2 go to GPU, 3rd goes to CPU
538-
for i in range(3):
539-
lora_req = LoRARequest(f"lora_{i}", i + 1, get_lora_path(i))
540-
await engine.add_lora(lora_req)
539+
# Load LoRA1 (goes to GPU) and LoRA2 (goes to CPU)
540+
lora1 = LoRARequest("lora_0", 1, get_lora_path(0))
541+
lora2 = LoRARequest("lora_1", 2, get_lora_path(1))
542+
await engine.add_lora(lora1)
543+
await engine.add_lora(lora2)
541544

542-
sampling_params = SamplingParams(max_tokens=100) # Longer to ensure overlap
545+
sampling_params = SamplingParams(max_tokens=100) # Long enough for overlap
543546

544-
# Track detailed timing
545547
timing = {
546548
"request_start": None,
547-
"request_first_token": None,
548549
"request_end": None,
549550
"pin_start": None,
550551
"pin_end": None,
551552
"token_count": 0,
552553
}
553554
results = {
554-
"inflight_output": None,
555+
"request_output": None,
556+
"request_error": None,
555557
"pin_result": None,
556558
"pin_error": None,
557559
}
558560

559561
async def inflight_request():
560-
"""Run an in-flight request using LoRA1, tracking token timing."""
562+
"""Run request using LoRA1 (on GPU)."""
561563
timing["request_start"] = time.perf_counter()
562-
lora_req = LoRARequest("lora_0", 1, get_lora_path(0))
563-
output_text = ""
564-
async for output in engine.generate(
565-
"Write a detailed story about a dragon and a knight: ",
566-
sampling_params,
567-
request_id="inflight_lora1",
568-
lora_request=lora_req
569-
):
570-
if timing["request_first_token"] is None:
571-
timing["request_first_token"] = time.perf_counter()
572-
timing["token_count"] = len(output.outputs[0].token_ids)
573-
output_text = output.outputs[0].text
564+
try:
565+
output_text = ""
566+
async for output in engine.generate(
567+
"Write a story about a wizard: ",
568+
sampling_params,
569+
request_id="inflight_lora1",
570+
lora_request=LoRARequest("lora_0", 1, get_lora_path(0))
571+
):
572+
timing["token_count"] = len(output.outputs[0].token_ids)
573+
output_text = output.outputs[0].text
574+
results["request_output"] = output_text
575+
except Exception as e:
576+
results["request_error"] = str(e)
574577
timing["request_end"] = time.perf_counter()
575-
results["inflight_output"] = output_text
576-
return output_text
577578

578-
async def pin_lora3_after_delay():
579-
"""Wait briefly then try to pin LoRA3 (on CPU)."""
580-
await asyncio.sleep(0.2) # Wait for request to start generating
579+
async def pin_cpu_lora_after_delay():
580+
"""Wait briefly then pin LoRA2 (on CPU)."""
581+
await asyncio.sleep(0.2) # Wait for request to start
581582

582583
timing["pin_start"] = time.perf_counter()
583584
try:
584-
pin_result = await engine.pin_lora(3)
585-
results["pin_result"] = pin_result
585+
results["pin_result"] = await engine.pin_lora(2)
586586
except Exception as e:
587587
results["pin_error"] = str(e)
588588
timing["pin_end"] = time.perf_counter()
589589

590590
# Run both concurrently
591-
global_start = time.perf_counter()
592591
await asyncio.gather(
593592
inflight_request(),
594-
pin_lora3_after_delay()
593+
pin_cpu_lora_after_delay()
595594
)
596595

597596
# Analyze timing
598597
request_duration = (timing["request_end"] - timing["request_start"]) * 1000
599598
pin_duration = (timing["pin_end"] - timing["pin_start"]) * 1000
600599
pin_started_at = (timing["pin_start"] - timing["request_start"]) * 1000
601600
pin_ended_at = (timing["pin_end"] - timing["request_start"]) * 1000
602-
request_ended_at = (timing["request_end"] - timing["request_start"]) * 1000
603601

604-
# Did pin_lora complete BEFORE request finished?
602+
# Did pin complete before request finished?
605603
pin_during_request = timing["pin_end"] < timing["request_end"]
606604

607605
print(f"\n{'='*60}")
608-
print("PIN_LORA DURING IN-FLIGHT REQUEST - DETAILED TIMING")
606+
print("PIN CPU-RESIDENT LORA DURING IN-FLIGHT REQUEST")
609607
print(f"{'='*60}")
610-
print(f"Scenario: LoRA1 & LoRA2 on GPU, LoRA3 on CPU")
611-
print(f"Action: Start request with LoRA1, then pin LoRA3")
608+
print(f"Config: max_loras=1 (single GPU slot)")
609+
print(f"Setup: LoRA1 on GPU, LoRA2 on CPU")
610+
print(f"Action: Start request with LoRA1, then pin(LoRA2)")
612611
print(f"\nTiming (relative to request start):")
613-
print(f" Request started: 0 ms")
614-
print(f" pin_lora started: {pin_started_at:.0f} ms")
615-
print(f" pin_lora ended: {pin_ended_at:.0f} ms (took {pin_duration:.0f} ms)")
616-
print(f" Request ended: {request_ended_at:.0f} ms (took {request_duration:.0f} ms)")
617-
print(f" Tokens generated: {timing['token_count']}")
612+
print(f" Request started: 0 ms")
613+
print(f" pin_lora started: {pin_started_at:.0f} ms")
614+
print(f" pin_lora ended: {pin_ended_at:.0f} ms (took {pin_duration:.0f} ms)")
615+
print(f" Request ended: {request_duration:.0f} ms")
616+
print(f" Tokens generated: {timing['token_count']}")
618617
print(f"\nResults:")
618+
print(f" Request output: {'SUCCESS' if results['request_output'] else 'FAILED'}")
619+
print(f" Request error: {results['request_error']}")
619620
print(f" pin_lora result: {results['pin_result']}")
620-
print(f" pin_lora error: {results['pin_error']}")
621+
print(f" pin_lora error: {results['pin_error']}")
621622
print(f"\nKEY FINDING:")
622-
if pin_during_request:
623-
print(f" pin_lora COMPLETED DURING in-flight request (non-blocking)")
623+
if results["request_error"]:
624+
print(f" Request was TERMINATED by pin_lora")
625+
elif pin_during_request:
626+
print(f" pin_lora completed DURING request (non-blocking)")
624627
else:
625-
print(f" pin_lora completed AFTER request finished (blocking or sequential)")
628+
print(f" pin_lora WAITED for request to finish")
626629
print(f"{'='*60}")
627630

628-
# Verify the in-flight request completed successfully
629-
assert results["inflight_output"], "In-flight request should produce output"
631+
print("PASS: Documented pin behavior with in-flight request")
630632

631-
print("PASS: Documented pin_lora timing behavior")
633+
"""
634+
KEY FINDING: Request was TERMINATED by pin_lora
635+
636+
The actual error is:
637+
RuntimeError: All items are pinned, cannot remove oldest from the cache.
638+
639+
What happened:
640+
1. LoRA1 is on GPU (the only slot), in-flight request running
641+
2. pin_lora(2) is called → pins LoRA2 (on CPU)
642+
3. Scheduler tries to continue the request but now:
643+
- LoRA2 is pinned (can't be evicted)
644+
- LoRA1 is in use
645+
- Only 1 GPU slot available
646+
- When it tries to swap, it can't evict anything → engine crashes
647+
648+
TODO: This is essentially a vLLM edge case/bug: pinning a CPU-resident LoRA while another LoRA is actively running with max_loras=1 crashes the engine because it creates an unresolvable state.
649+
"""
632650

633651

634652
if __name__ == "__main__":

0 commit comments

Comments
 (0)