Skip to content

Commit 2a7abbb

Browse files
author
Ylang Tsou
committed
build with script
Signed-off-by: Ylang Tsou <ylangt@google.com>
1 parent 5637e1d commit 2a7abbb

File tree

4 files changed

+135
-271
lines changed

4 files changed

+135
-271
lines changed

.buildkite/pipeline_test_pypi.yml

Lines changed: 33 additions & 224 deletions
Original file line numberDiff line numberDiff line change
@@ -2,230 +2,39 @@ steps:
22
# -----------------------------------------------------------------
33
# TEST STEPS - Calling wrapper
44
# -----------------------------------------------------------------
5-
- label: "wheel E2E MLPerf tests for JAX models"
6-
key: test_0
5+
- label: "Performance benchmarks for meta-llama/Llama-3.1-8B-Instruct"
6+
key: "meta-llama_Llama-3_1-8B-Instruct_Benchmark"
77
soft_fail: true
88
agents:
9-
queue: tpu_v6e_queue
10-
commands:
11-
- .buildkite/scripts/run_with_pypi.sh bash /workspace/tpu_inference/tests/e2e/benchmarking/mlperf.sh
12-
13-
- label: "wheel E2E multi modality test"
14-
key: test_5
15-
soft_fail: true
16-
agents:
17-
queue: tpu_v6e_queue
18-
commands:
19-
- |
20-
.buildkite/scripts/run_with_pypi.sh \
21-
bash -c 'python3 -m pytest -s -v -x /workspace/tpu_inference/tests/e2e/test_multi_modal_inference.py && \
22-
bash /workspace/tpu_inference/tests/e2e/benchmarking/mm_bench.sh'
23-
24-
- label: "wheel E2E speculative decoding test"
25-
key: test_6
26-
soft_fail: true
27-
agents:
28-
queue: tpu_v6e_queue
29-
commands:
30-
- |
31-
.buildkite/scripts/run_with_pypi.sh \
32-
bash -c 'python3 -m pytest -s -v -x /workspace/tpu_inference/tests/e2e/test_speculative_decoding.py'
33-
34-
- label: "wheel JAX unit tests"
35-
key: test_7
36-
soft_fail: true
37-
agents:
38-
queue: tpu_v6e_queue
39-
commands:
40-
- |
41-
.buildkite/scripts/run_with_pypi.sh \
42-
python3 -m pytest -s -v -x /workspace/tpu_inference/tests/ \
43-
--ignore=/workspace/tpu_inference/tests/kernels \
44-
--ignore=/workspace/tpu_inference/tests/lora \
45-
--ignore=/workspace/tpu_inference/tests/e2e \
46-
--ignore=/workspace/tpu_inference/tpu_inference/mock \
47-
--cov-config=/workspace/tpu_inference/.coveragerc --cov tpu_inference --cov-report term-missing --cov-fail-under=69
48-
49-
- label: "wheel JAX unit tests - kernels"
50-
key: test_8
51-
soft_fail: true
52-
agents:
53-
queue: tpu_v6e_queue
54-
commands:
55-
- |
56-
.buildkite/scripts/run_with_pypi.sh \
57-
python3 -m pytest -s -v -x /workspace/tpu_inference/tests/kernels \
58-
--ignore=/workspace/tpu_inference/tests/kernels/ragged_paged_attention_kernel_v2_test.py \
59-
--ignore=/workspace/tpu_inference/tests/kernels/ragged_kv_cache_update_v2_test.py \
60-
--ignore=/workspace/tpu_inference/tests/kernels/collectives
61-
62-
- label: "wheel JAX unit tests - collective kernels"
63-
key: test_9
64-
soft_fail: true
65-
agents:
66-
queue: tpu_v6e_8_queue
67-
commands:
68-
- |
69-
.buildkite/scripts/run_with_pypi.sh \
70-
python3 -m pytest -s -v -x /workspace/tpu_inference/tests/kernels/collectives
71-
72-
- label: "wheel lora tests for JAX + vLLM models"
73-
key: test_10
74-
soft_fail: true
75-
agents:
76-
queue: tpu_v6e_queue
77-
commands:
78-
- |
79-
.buildkite/scripts/run_with_pypi.sh \
80-
bash -c 'MODEL_IMPL_TYPE=vllm TPU_BACKEND_TYPE=jax python3 -m pytest -s -v -x /workspace/tpu_inference/tests/lora/test_lora.py'
81-
82-
- label: "wheel E2E MLperf tests for DeepSeek-R1 (no accuracy, 12-decoder layers only)"
83-
key: test_12
84-
soft_fail: true
9+
queue: tpu_v6e_queue
8510
env:
86-
NEW_MODEL_DESIGN: "True"
87-
USE_V6E8_QUEUE: "True"
88-
SKIP_ACCURACY_TESTS: "True"
89-
VLLM_MLA_DISABLE: "1"
90-
JAX_RANDOM_WEIGHTS: "True"
91-
agents:
92-
queue: tpu_v6e_8_queue
93-
commands:
94-
- .buildkite/scripts/run_with_pypi.sh bash /workspace/tpu_inference/tests/e2e/benchmarking/mlperf.sh -m deepseek-ai/DeepSeek-R1-0528
95-
96-
97-
98-
- label: "wheel TPU Test 0: test_perf.py"
99-
key: tpu_test_0
100-
soft_fail: true
101-
agents:
102-
queue: tpu_v6e_queue
103-
commands:
104-
- .buildkite/scripts/run_with_pypi.sh python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py
105-
106-
- label: "wheel TPU Test 1: test_compilation.py"
107-
key: tpu_test_1
108-
soft_fail: true
109-
agents:
110-
queue: tpu_v6e_queue
111-
commands:
112-
- .buildkite/scripts/run_with_pypi.sh python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py
113-
114-
- label: "wheel TPU Test 2: test_basic.py"
115-
key: tpu_test_2
116-
soft_fail: true
117-
agents:
118-
queue: tpu_v6e_queue
119-
commands:
120-
- .buildkite/scripts/run_with_pypi.sh python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py
121-
122-
- label: "wheel TPU Test 3: test_accuracy.py (v1)"
123-
key: tpu_test_3
124-
soft_fail: true
125-
agents:
126-
queue: tpu_v6e_queue
127-
commands:
128-
- .buildkite/scripts/run_with_pypi.sh python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine
129-
130-
- label: "wheel TPU Test 4: test_quantization_accuracy.py"
131-
key: tpu_test_4
132-
soft_fail: true
133-
agents:
134-
queue: tpu_v6e_queue
135-
commands:
136-
- .buildkite/scripts/run_with_pypi.sh python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py
137-
138-
- label: "wheel TPU Test 5: examples/offline_inference/tpu.py"
139-
key: tpu_test_5
140-
soft_fail: true
141-
agents:
142-
queue: tpu_v6e_queue
143-
commands:
144-
- .buildkite/scripts/run_with_pypi.sh python3 /workspace/vllm/examples/offline_inference/tpu.py
145-
146-
- label: "wheel TPU Test 6: test_tpu_model_runner.py"
147-
key: tpu_test_6
148-
soft_fail: true
149-
agents:
150-
queue: tpu_v6e_queue
151-
commands:
152-
- .buildkite/scripts/run_with_pypi.sh python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py
153-
154-
- label: "wheel TPU Test 7: test_sampler.py"
155-
key: tpu_test_7
156-
soft_fail: true
157-
agents:
158-
queue: tpu_v6e_queue
159-
commands:
160-
- .buildkite/scripts/run_with_pypi.sh python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py
161-
162-
- label: "wheel TPU Test 8: test_topk_topp_sampler.py"
163-
key: tpu_test_8
164-
soft_fail: true
165-
agents:
166-
queue: tpu_v6e_queue
167-
commands:
168-
- .buildkite/scripts/run_with_pypi.sh python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py
169-
170-
- label: "wheel TPU Test 9: test_multimodal.py"
171-
key: tpu_test_9
172-
soft_fail: true
173-
agents:
174-
queue: tpu_v6e_queue
175-
commands:
176-
- .buildkite/scripts/run_with_pypi.sh python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py
177-
178-
- label: "wheel TPU Test 10: test_pallas.py"
179-
key: tpu_test_10
180-
soft_fail: true
181-
agents:
182-
queue: tpu_v6e_queue
183-
commands:
184-
- .buildkite/scripts/run_with_pypi.sh python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py
185-
186-
- label: "wheel TPU Test 11: test_struct_output_generate.py"
187-
key: tpu_test_11
188-
soft_fail: true
189-
agents:
190-
queue: tpu_v6e_queue
191-
commands:
192-
- .buildkite/scripts/run_with_pypi.sh python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py
193-
194-
- label: "wheel TPU Test 12: test_moe_pallas.py"
195-
key: tpu_test_12
196-
soft_fail: true
197-
agents:
198-
queue: tpu_v6e_queue
199-
commands:
200-
- .buildkite/scripts/run_with_pypi.sh python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py
201-
202-
- label: "wheel TPU Test 13: ragged_paged_attention_test.py"
203-
key: tpu_test_13
204-
soft_fail: true
205-
agents:
206-
queue: tpu_v6e_queue
207-
commands:
208-
- .buildkite/scripts/run_with_pypi.sh python3 -m pytest -s -v /workspace/tpu_inference/tests/ragged_paged_attention_test.py
209-
210-
# -----------------------------------------------------------------
211-
# NOTIFICATION STEP
212-
# -----------------------------------------------------------------
213-
- label: "wheel TPU V1 Test Notification"
214-
depends_on:
215-
- tpu_test_0
216-
- tpu_test_1
217-
- tpu_test_2
218-
- tpu_test_3
219-
- tpu_test_4
220-
- tpu_test_5
221-
- tpu_test_6
222-
- tpu_test_7
223-
- tpu_test_8
224-
- tpu_test_9
225-
- tpu_test_10
226-
- tpu_test_11
227-
- tpu_test_12
228-
- tpu_test_13
229-
agents:
230-
queue: tpu_v6e_queue
231-
commands: "bash .buildkite/scripts/check_results.sh 'TPU V1 Tests Failed' tpu_test_0 tpu_test_1 tpu_test_2 tpu_test_3 tpu_test_4 tpu_test_5 tpu_test_6 tpu_test_7 tpu_test_8 tpu_test_9 tpu_test_10 tpu_test_11 tpu_test_12 tpu_test_13"
11+
TEST_MODEL: meta-llama/Llama-3.1-8B-Instruct
12+
TENSOR_PARALLEL_SIZE: 1
13+
MINIMUM_THROUGHPUT_THRESHOLD: 10.77
14+
INPUT_LEN: 1800
15+
OUTPUT_LEN: 128
16+
PREFIX_LEN: 0
17+
MAX_MODEL_LEN: 2048
18+
MAX_NUM_SEQS: 256
19+
MAX_NUM_BATCHED_TOKENS: 1024
20+
commands:
21+
- |
22+
/usr/local/lib/python3.12/site-packages/.buildkite/scripts/run_with_pypi.sh bash /usr/local/lib/python3.12/site-packages/tpu_inference/tests/e2e/benchmarking/benchmark.sh
23+
24+
- label: "Performance benchmarks for Qwen/Qwen3-4B"
25+
key: "Qwen_Qwen3-4B_Benchmark"
26+
agents:
27+
queue: tpu_v6e_queue
28+
env:
29+
TEST_MODEL: Qwen/Qwen3-4B
30+
TENSOR_PARALLEL_SIZE: 1
31+
MINIMUM_THROUGHPUT_THRESHOLD: 11.00
32+
INPUT_LEN: 1800
33+
OUTPUT_LEN: 128
34+
PREFIX_LEN: 0
35+
MAX_MODEL_LEN: 2048
36+
MAX_NUM_SEQS: 94
37+
MAX_NUM_BATCHED_TOKENS: 4096
38+
commands:
39+
- |
40+
/usr/local/lib/python3.12/site-packages/.buildkite/scripts/run_with_pypi.sh bash /usr/local/lib/python3.12/site-packages/tpu_inference/tests/e2e/benchmarking/benchmark.sh
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
#!/bin/bash
#
# Build a vLLM-TPU wheel pinned against a specific tpu-inference release.
#
# Usage:
#   build_vllm_tpu.sh <tpu-inference-version> <vllm-tpu-version> [vllm-branch-or-tag]
#
# Clones vLLM, pins tpu-inference in requirements/tpu.txt, runs the upstream
# TPU build script, and restores requirements/tpu.txt afterwards (even when
# the build fails, via an EXIT trap). The wheel lands in vllm/dist.

set -euo pipefail

die() { printf 'ERROR: %s\n' "$*" >&2; exit 1; }

# --- Argument Validation (before first use: 'set -u' is active) ---
if [ "$#" -lt 2 ]; then
  echo "Usage: $0 <tpu-inference-version> <vllm-tpu-version> [vllm-branch-or-tag]" >&2
  echo "  [vllm-branch-or-tag] is optional, defaults to 'main'." >&2
  exit 1
fi

# --- Script Configuration ---
TPU_INFERENCE_VERSION=$1
VLLM_TPU_VERSION=$2
VLLM_BRANCH=${3:-main}
readonly VLLM_REPO="https://github.com/vllm-project/vllm.git"
readonly REPO_DIR="vllm"

echo "--- Starting vLLM-TPU wheel build ---"
echo "TPU Inference Version: ${TPU_INFERENCE_VERSION}"
echo "vLLM-TPU Version: ${VLLM_TPU_VERSION}"
echo "vLLM Branch/Tag: ${VLLM_BRANCH}"

# --- Step 1: Clone vLLM repository ---
if [ -d "$REPO_DIR" ]; then
  echo "Repository '$REPO_DIR' already exists. Skipping clone."
else
  echo "Cloning vLLM repository..."
  git clone "$VLLM_REPO"
fi
cd "$REPO_DIR" || die "Cannot cd into '$REPO_DIR'."

# --- Step 1.5: Checkout the specified vLLM branch/tag ---
echo "Checking out vLLM branch/tag: ${VLLM_BRANCH}..."
git checkout "$VLLM_BRANCH" \
  || die "Failed to checkout branch/tag '${VLLM_BRANCH}'. Please check the branch/tag name."
echo "Successfully checked out ${VLLM_BRANCH}."
# A tag checkout leaves a detached HEAD with no upstream, so pull best-effort.
git pull || echo "Warning: Failed to pull updates (may be on a tag)."

# --- Step 2: Update tpu-inference version in requirements ---
readonly REQUIRED_LINE="tpu-inference==${TPU_INFERENCE_VERSION}"
readonly REQUIREMENTS_FILE="requirements/tpu.txt"
readonly BACKUP_FILE="${REQUIREMENTS_FILE}.bak"

# Restore the requirements file on ANY exit — success, build failure, or
# signal — so a failed build never leaves a locally modified checkout behind.
cleanup() {
  if [ -f "$BACKUP_FILE" ]; then
    echo "Reverting $REQUIREMENTS_FILE from backup."
    rm -f -- "$REQUIREMENTS_FILE"
    mv -- "$BACKUP_FILE" "$REQUIREMENTS_FILE"
    echo "Cleanup complete."
  else
    echo "Warning: Backup file $BACKUP_FILE not found. Skipping revert."
  fi
}
trap cleanup EXIT

echo "Updating tpu-inference version in $REQUIREMENTS_FILE..."

# Fail early with a clear message instead of letting grep abort cryptically.
[ -f "$REQUIREMENTS_FILE" ] || die "$REQUIREMENTS_FILE not found. Is this a vLLM checkout?"

# Ensure the file ends with a newline so an appended line is not glued onto
# the last existing line. $(tail -c 1 ...) is empty iff the last byte is '\n'.
if [ "$(tail -c 1 "$REQUIREMENTS_FILE")" != "" ]; then
  printf '\n' >> "$REQUIREMENTS_FILE"
  echo "(Action: Added missing newline to the end of $REQUIREMENTS_FILE for safety.)"
fi

if grep -q "^tpu-inference==" "$REQUIREMENTS_FILE"; then
  # Replace the existing pin; sed -i.bak creates the backup cleanup reverts
  # from. '|' delimiter keeps the substitution safe if the version ever
  # contains '/' (e.g. a local version label).
  echo "(Action: Existing version found. Replacing.)"
  sed -i.bak "s|^tpu-inference==.*|$REQUIRED_LINE|" "$REQUIREMENTS_FILE"
else
  # No pin present: back up the ORIGINAL content first, then append.
  # (A bare 'touch' here would make the later revert truncate the file.)
  echo "(Action: Line not found. Appending new dependency.)"
  cp -- "$REQUIREMENTS_FILE" "$BACKUP_FILE"
  echo "$REQUIRED_LINE" >> "$REQUIREMENTS_FILE"
fi

# --- Step 3: Execute the vLLM TPU build script ---
echo "Ensuring 'build' package is installed..."
pip install build
echo "Executing the vLLM TPU build script..."
bash tools/vllm-tpu/build.sh "$VLLM_TPU_VERSION"

echo "--- Build complete! ---"
echo "The wheel file can be found in the 'vllm/dist' directory."

# --- Step 4: Cleanup and revert run automatically via the EXIT trap ---
echo "Script finished."

docker/Dockerfile.pypi

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -18,27 +18,25 @@ RUN export TPU_INFERENCE_VERSION=$(pip index versions tpu-inference --pre 2>/dev
1818
echo -n "${TPU_INFERENCE_VERSION}" > /tmp/tpu_inference_version
1919

2020
# Clone vLLM
21+
WORKDIR /workspace
22+
RUN export VLLM_TPU_VERSION=$(cat /tmp/tpu_inference_version) && \
23+
bash tpu_inference/.buildkite/scripts/build_vllm_tpu.sh ${VLLM_TPU_VERSION} ${VLLM_TPU_VERSION}
24+
25+
# Install vllm-tpu from whl
2126
WORKDIR /workspace/vllm
22-
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
23-
RUN git clone $VLLM_REPO /workspace/vllm
24-
RUN pip install build
25-
RUN export TPU_INFERENCE_VERSION=$(cat /tmp/tpu_inference_version) && \
26-
sed -i "s/^tpu-inference==.*/tpu-inference==${TPU_INFERENCE_VERSION}/" requirements/tpu.txt && \
27-
bash tools/vllm-tpu/build.sh ${TPU_INFERENCE_VERSION}
28-
29-
# Install vllm-tpu from wheel
3027
RUN pip install --no-cache-dir dist/*.whl
3128

3229
# Install test dependencies
33-
RUN python3 -m pip install -e tests/vllm_test_utils
30+
RUN python3 -m pip install tests/vllm_test_utils
3431
RUN python3 -m pip install --no-cache-dir \
3532
git+https://github.com/thuml/depyf.git \
3633
pytest-asyncio \
3734
git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] \
3835
pytest-cov \
3936
tblib
4037

41-
# Set environment variable to use site-packages
42-
ENV FORCE_USE_SITE_PACKAGES=1
38+
# Remove repository
39+
WORKDIR /workspace
40+
RUN rm -rf /workspace/vllm /workspace/tpu_inference
4341

4442
CMD ["/bin/bash"]

0 commit comments

Comments
 (0)