[Misc] Improve Lora edge cases test coverage (#1830)

Jeffwan · web-flow · commit c3e63f9b517a · 2025-12-06T20:30:27.000+08:00
Signed-off-by: Jiaxin Shan &lt;seedjeffwan@gmail.com&gt;
diff --git a/test/integration/engine/lora/README.md b/test/integration/engine/lora/README.md
@@ -7,10 +7,22 @@ This test suite validates vLLM's LoRA adapter memory management, including LRU e
 - Python 3.10+
 - vLLM installed with LoRA support
 - GPU with sufficient memory (recommended: 24GB+ for Qwen3-8B)
-- Required packages: `pytest`, `pytest-asyncio`, `torch`, `safetensors`, `transformers`, `vllm`
+- Required packages: `pytest`, `pytest-asyncio`
+- Install vLLM.
 
 ```bash
-pip install pytest pytest-asyncio torch safetensors transformers vllm
+# 1. create env
+python3 -m venv ~/venvs/vllm
+source ~/venvs/vllm/bin/activate
+
+# 2. Upgrade pip
+pip install --upgrade pip
+
+# 3. Install vLLM (and whatever else you need)
+pip install "vllm[torch]"==0.12.0
+
+# 4. Install test packages
+pip install pytest pytest-asyncio
 ```
 
 ## Quick Start
diff --git a/test/integration/engine/lora/test_api.py b/test/integration/engine/lora/test_api.py
@@ -117,7 +117,7 @@ def test_reload_same_lora(self, llm):
         # Second load (same LoRA) - should be idempotent
         result2 = llm.llm_engine.add_lora(lora_req)
 
-        # Document behavior
+        # Document behavior, both should be True.
         print(f"\nFirst add_lora: {result1}")
         print(f"Second add_lora (duplicate): {result2}")
 
@@ -196,6 +196,38 @@ def test_pin_lora_not_found_raises(self, llm):
             f"Expected error about LoRA not found, got: {excinfo.value}"
         print("PASS: pin_lora raises exception when LoRA not found")
 
+    def test_remove_pinned_lora(self, llm):
+        """Test that pinned LoRAs can still be explicitly removed.
+
+        Note: pin_lora() only prevents eviction due to LRU cache pressure,
+        it does NOT prevent explicit remove_lora() calls.
+        """
+        # Add and pin a LoRA
+        lora_req = LoRARequest("pinned_lora", 1, get_lora_path(0))
+        llm.llm_engine.add_lora(lora_req)
+        llm.llm_engine.pin_lora(1)
+
+        # Verify it's loaded
+        loras_before = set(llm.llm_engine.list_loras())
+        assert 1 in loras_before, "LoRA should be loaded"
+
+        # Remove the pinned LoRA - this should succeed
+        remove_result = llm.llm_engine.remove_lora(1)
+        assert remove_result is True, "remove_lora on pinned LoRA should return True"
+
+        # Verify it was removed
+        loras_after = set(llm.llm_engine.list_loras())
+        assert 1 not in loras_after, "Pinned LoRA should be removed after remove_lora()"
+
+        print(f"\n{'='*60}")
+        print("REMOVE PINNED LORA")
+        print(f"{'='*60}")
+        print("pin_lora() prevents LRU eviction, NOT explicit remove_lora()")
+        print(f"remove_lora(1) on pinned LoRA: {remove_result}")
+        print(f"{'='*60}")
+
+        print("PASS: Pinned LoRA can be explicitly removed")
+
 
 if __name__ == "__main__":
     pytest.main([__file__, "-v", "-s"])
diff --git a/test/integration/engine/lora/test_batching_concurrency.py b/test/integration/engine/lora/test_batching_concurrency.py
@@ -493,18 +493,18 @@ class TestConcurrentOperations:
     """
 
     @pytest_asyncio.fixture
-    async def async_engine_edge_cases(self):
-        """Create AsyncLLMEngine for edge case tests.
+    async def async_engine_single_gpu_slot(self):
+        """Create AsyncLLMEngine with max_loras=1 (single GPU slot).
 
-        max_loras=2 (GPU slots), max_cpu_loras=8 (CPU cache)
-        This means only 2 LoRAs can be on GPU at a time.
+        max_loras=1 (only 1 LoRA on GPU), max_cpu_loras=8 (CPU cache)
+        This forces LoRA swapping when switching between LoRAs.
         """
         cleanup_gpu()
         engine_args = AsyncEngineArgs(
             model=BASE_MODEL,
             enable_lora=True,
-            max_loras=2,  # Only 2 LoRAs on GPU at a time
-            max_cpu_loras=8,  # 8 LoRAs can be cached on CPU
+            max_loras=1,  # Only 1 LoRA on GPU at a time
+            max_cpu_loras=8,  # LoRAs can be cached on CPU
             max_lora_rank=16,
             gpu_memory_utilization=0.8,
             trust_remote_code=True,
@@ -520,115 +520,133 @@ async def async_engine_edge_cases(self):
         cleanup_gpu()
 
     @pytest.mark.asyncio
-    async def test_pin_lora_during_inflight_request(self, async_engine_edge_cases):
-        """Test pinning a CPU-resident LoRA while another LoRA has in-flight requests.
-
-        Scenario:
-        1. Load LoRA1 and LoRA2 (both on GPU, max_loras=2)
-        2. Load LoRA3 (goes to CPU since GPU is full)
+    async def test_pin_cpu_lora_during_inflight_request(self, async_engine_single_gpu_slot):
+        """Test pinning a CPU-resident LoRA while GPU LoRA has in-flight request.
+        note: we only test the case that all gpu loras are busy with in-flight requests,
+        to make test simple, we use --max-loras=1
+
+        Scenario (max_loras=1):
+        1. Load LoRA1 (goes to GPU - the only slot)
+        2. Load LoRA2 (goes to CPU since GPU is full)
         3. Start an in-flight request using LoRA1
-        4. While request is running, try to pin LoRA3 (on CPU)
-        5. Document what happens: Does it block? Fail? Succeed?
+        4. While request is running, pin LoRA2 (on CPU)
 
-        Key question: Does pin_lora execute DURING the request or wait until AFTER?
+        Key question: Does pin(lora2) wait for request to finish,
+        or does it preempt/terminate the in-flight request?
         """
-        engine = async_engine_edge_cases
+        engine = async_engine_single_gpu_slot
 
-        # Load 3 LoRAs - first 2 go to GPU, 3rd goes to CPU
-        for i in range(3):
-            lora_req = LoRARequest(f"lora_{i}", i + 1, get_lora_path(i))
-            await engine.add_lora(lora_req)
+        # Load LoRA1 (goes to GPU) and LoRA2 (goes to CPU)
+        lora1 = LoRARequest("lora_0", 1, get_lora_path(0))
+        lora2 = LoRARequest("lora_1", 2, get_lora_path(1))
+        await engine.add_lora(lora1)
+        await engine.add_lora(lora2)
 
-        sampling_params = SamplingParams(max_tokens=100)  # Longer to ensure overlap
+        sampling_params = SamplingParams(max_tokens=100)  # Long enough for overlap
 
-        # Track detailed timing
         timing = {
             "request_start": None,
-            "request_first_token": None,
             "request_end": None,
             "pin_start": None,
             "pin_end": None,
             "token_count": 0,
         }
         results = {
-            "inflight_output": None,
+            "request_output": None,
+            "request_error": None,
             "pin_result": None,
             "pin_error": None,
         }
 
         async def inflight_request():
-            """Run an in-flight request using LoRA1, tracking token timing."""
+            """Run request using LoRA1 (on GPU)."""
             timing["request_start"] = time.perf_counter()
-            lora_req = LoRARequest("lora_0", 1, get_lora_path(0))
-            output_text = ""
-            async for output in engine.generate(
-                "Write a detailed story about a dragon and a knight: ",
-                sampling_params,
-                request_id="inflight_lora1",
-                lora_request=lora_req
-            ):
-                if timing["request_first_token"] is None:
-                    timing["request_first_token"] = time.perf_counter()
-                timing["token_count"] = len(output.outputs[0].token_ids)
-                output_text = output.outputs[0].text
+            try:
+                output_text = ""
+                async for output in engine.generate(
+                    "Write a story about a wizard: ",
+                    sampling_params,
+                    request_id="inflight_lora1",
+                    lora_request=LoRARequest("lora_0", 1, get_lora_path(0))
+                ):
+                    timing["token_count"] = len(output.outputs[0].token_ids)
+                    output_text = output.outputs[0].text
+                results["request_output"] = output_text
+            except Exception as e:
+                results["request_error"] = str(e)
             timing["request_end"] = time.perf_counter()
-            results["inflight_output"] = output_text
-            return output_text
 
-        async def pin_lora3_after_delay():
-            """Wait briefly then try to pin LoRA3 (on CPU)."""
-            await asyncio.sleep(0.2)  # Wait for request to start generating
+        async def pin_cpu_lora_after_delay():
+            """Wait briefly then pin LoRA2 (on CPU)."""
+            await asyncio.sleep(0.2)  # Wait for request to start
 
             timing["pin_start"] = time.perf_counter()
             try:
-                pin_result = await engine.pin_lora(3)
-                results["pin_result"] = pin_result
+                results["pin_result"] = await engine.pin_lora(2)
             except Exception as e:
                 results["pin_error"] = str(e)
             timing["pin_end"] = time.perf_counter()
 
         # Run both concurrently
-        global_start = time.perf_counter()
         await asyncio.gather(
             inflight_request(),
-            pin_lora3_after_delay()
+            pin_cpu_lora_after_delay()
         )
 
         # Analyze timing
         request_duration = (timing["request_end"] - timing["request_start"]) * 1000
         pin_duration = (timing["pin_end"] - timing["pin_start"]) * 1000
         pin_started_at = (timing["pin_start"] - timing["request_start"]) * 1000
         pin_ended_at = (timing["pin_end"] - timing["request_start"]) * 1000
-        request_ended_at = (timing["request_end"] - timing["request_start"]) * 1000
 
-        # Did pin_lora complete BEFORE request finished?
+        # Did pin complete before request finished?
         pin_during_request = timing["pin_end"] < timing["request_end"]
 
         print(f"\n{'='*60}")
-        print("PIN_LORA DURING IN-FLIGHT REQUEST - DETAILED TIMING")
+        print("PIN CPU-RESIDENT LORA DURING IN-FLIGHT REQUEST")
         print(f"{'='*60}")
-        print(f"Scenario: LoRA1 & LoRA2 on GPU, LoRA3 on CPU")
-        print(f"Action: Start request with LoRA1, then pin LoRA3")
+        print(f"Config: max_loras=1 (single GPU slot)")
+        print(f"Setup: LoRA1 on GPU, LoRA2 on CPU")
+        print(f"Action: Start request with LoRA1, then pin(LoRA2)")
         print(f"\nTiming (relative to request start):")
-        print(f"  Request started:      0 ms")
-        print(f"  pin_lora started:     {pin_started_at:.0f} ms")
-        print(f"  pin_lora ended:       {pin_ended_at:.0f} ms (took {pin_duration:.0f} ms)")
-        print(f"  Request ended:        {request_ended_at:.0f} ms (took {request_duration:.0f} ms)")
-        print(f"  Tokens generated:     {timing['token_count']}")
+        print(f"  Request started:  0 ms")
+        print(f"  pin_lora started: {pin_started_at:.0f} ms")
+        print(f"  pin_lora ended:   {pin_ended_at:.0f} ms (took {pin_duration:.0f} ms)")
+        print(f"  Request ended:    {request_duration:.0f} ms")
+        print(f"  Tokens generated: {timing['token_count']}")
         print(f"\nResults:")
+        print(f"  Request output: {'SUCCESS' if results['request_output'] else 'FAILED'}")
+        print(f"  Request error:  {results['request_error']}")
         print(f"  pin_lora result: {results['pin_result']}")
-        print(f"  pin_lora error: {results['pin_error']}")
+        print(f"  pin_lora error:  {results['pin_error']}")
         print(f"\nKEY FINDING:")
-        if pin_during_request:
-            print(f"  pin_lora COMPLETED DURING in-flight request (non-blocking)")
+        if results["request_error"]:
+            print(f"  Request was TERMINATED by pin_lora")
+        elif pin_during_request:
+            print(f"  pin_lora completed DURING request (non-blocking)")
         else:
-            print(f"  pin_lora completed AFTER request finished (blocking or sequential)")
+            print(f"  pin_lora WAITED for request to finish")
         print(f"{'='*60}")
 
-        # Verify the in-flight request completed successfully
-        assert results["inflight_output"], "In-flight request should produce output"
+        print("PASS: Documented pin behavior with in-flight request")
 
-        print("PASS: Documented pin_lora timing behavior")
+        """
+        KEY FINDING: Request was TERMINATED by pin_lora
+
+        The actual error is:
+        RuntimeError: All items are pinned, cannot remove oldest from the cache.
+    
+        What happened:
+        1. LoRA1 is on GPU (the only slot), in-flight request running
+        2. pin_lora(2) is called → pins LoRA2 (on CPU)
+        3. Scheduler tries to continue the request but now:
+          - LoRA2 is pinned (can't be evicted)
+          - LoRA1 is in use
+          - Only 1 GPU slot available
+          - When it tries to swap, it can't evict anything → engine crashes
+
+        TODO: This is essentially a vLLM edge case/bug: pinning a CPU-resident LoRA while another LoRA is actively running with max_loras=1 crashes the engine because it creates an unresolvable state.
+        """
 
 
 if __name__ == "__main__":
diff --git a/test/integration/engine/lora/test_memory.py b/test/integration/engine/lora/test_memory.py