Clear the XLA compilation cache before each disaggregated server launch (#1271)

sixiang-google · web-flow · commit d80872a505eb · 2025-12-09T20:06:05.000-08:00
diff --git a/tests/e2e/test_local_disagg.py b/tests/e2e/test_local_disagg.py
@@ -7,6 +7,7 @@
 from unittest.mock import patch
 
 import pytest
+import vllm.envs as vllm_envs
 from vllm import LLM, EngineArgs, SamplingParams
 
 from tpu_inference.core.core_tpu import DisaggEngineCore, DisaggEngineCoreProc
@@ -94,7 +95,7 @@ def test_disaggregated_serving(test_prompts, sampling_params):
              patch("vllm.v1.engine.core.EngineCoreProc", DisaggEngineCoreProc):
 
             model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
-
+            os.system(f"rm -rf {vllm_envs.VLLM_XLA_CACHE_PATH}/*")
             engine_args = EngineArgs(
                 model=model_name,
                 max_model_len=2048,
@@ -194,7 +195,9 @@ def test_disaggregated_serving_correctness(test_prompts, sampling_params):
                                       is_disagg=False)
 
     # Run disaggregated inference
+    os.system(f"rm -rf {vllm_envs.VLLM_XLA_CACHE_PATH}/*")
     print("Running Disaggregated Inference...")
+
     disagg_outputs = _run_inference(model_name=model_name,
                                     test_prompts=small_prompts,
                                     sampling_params=sampling_params,