From 59f9c4f9e6b187b3b44d2a615dfd7cdbf4b3b072 Mon Sep 17 00:00:00 2001 From: jcgu Date: Wed, 1 Jul 2026 16:16:47 -0700 Subject: [PATCH] Update KV host-offloading example to use RaidenOffloadConnector from tpu-inference. PiperOrigin-RevId: 941349331 --- examples/kv_host_offloading/README.md | 14 ++- examples/kv_host_offloading/setup.sh | 139 ++++++++++++++++++++++++++ 2 files changed, 148 insertions(+), 5 deletions(-) create mode 100644 examples/kv_host_offloading/setup.sh diff --git a/examples/kv_host_offloading/README.md b/examples/kv_host_offloading/README.md index 63be1e8..6a313db 100644 --- a/examples/kv_host_offloading/README.md +++ b/examples/kv_host_offloading/README.md @@ -32,8 +32,8 @@ host block pool are provided by tpu-raiden. - The venv must have **`tpu_raiden`**, **`vllm`**, and **`tpu_inference`** importable (see Step 1). These scripts do **not** activate any environment — activate your venv yourself before running them. -- The benchmark (`benchmark.sh`) uses tpu-inference's `benchmark_serving.py`, which imports - **`evaluate`** and **`nltk`** — `pip install evaluate nltk` if missing. +- The benchmark (`benchmark.sh`) uses tpu-inference's `benchmark_serving.py`, which + imports **`evaluate`** and **`nltk`** (installed by `setup.sh`). - Hugging Face access for the model (e.g. `export HF_TOKEN=...`) if it is gated. --- @@ -55,13 +55,17 @@ In an **activated** python3.12 venv: The run scripts pick up whichever install you did (see `raiden_env.sh`). 2. Install **vllm** + **tpu_inference** (the offload connector ships in - tpu_inference). 
If you don't already have them, the sibling disagg example's - installer fetches compatible, pinned versions: + tpu_inference), plus the benchmark client's `evaluate`/`nltk` deps, via this + example's own installer: ```bash - bash ../single_host_disagg/setup.sh + bash setup.sh ``` + `setup.sh` clones vLLM + tpu-inference at pinned commits (which include the + RaidenOffloadConnector) into a hidden in-tree `.src/` and installs them editable. + Override `VLLM_COMMIT` / `TPU_INFERENCE_COMMIT` to use different versions. + ## Step 2 — Run the demo ```bash diff --git a/examples/kv_host_offloading/setup.sh b/examples/kv_host_offloading/setup.sh new file mode 100644 index 0000000..c65c2db --- /dev/null +++ b/examples/kv_host_offloading/setup.sh @@ -0,0 +1,139 @@ +#!/bin/bash + +# Copyright 2026 Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2026 Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# Environment bootstrap for the KV host-offloading example.
#
# What it does: clones vLLM and tpu-inference (the latter provides the
# RaidenOffloadConnector) into a hidden in-tree `.src/` directory and installs
# both in editable mode into whichever venv is active right now. It does not
# rely on any other example's setup script.
#
# Before running:
#   1. Create AND activate a python3.12 venv.
#   2. Install tpu_raiden into that venv through either supported route:
#        - from source: `./build.sh jax` at the repo root, or
#        - from a wheel:
#          `pip install tpu-raiden-jax --extra-index-url <index-url>`
#
# Usage (from this directory):
#   bash setup.sh
#
# Environment overrides (each falls back to the default assigned below when it
# is unset or empty): RAIDEN_ROOT, SRC_DIR, VLLM_REPO, VLLM_COMMIT,
# TPU_INFERENCE_REPO, TPU_INFERENCE_COMMIT.
set -euo pipefail

# Absolute path of the directory that holds this script.
SCRIPTS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Repo root: two levels above this script unless the caller supplied one.
# Kept as a plain assignment so a failing `cd` still aborts under `set -e`.
if [ -z "${RAIDEN_ROOT:-}" ]; then
  RAIDEN_ROOT="$(cd "${SCRIPTS_DIR}/../.." && pwd)"
fi

# Directory that receives the vLLM and tpu-inference checkouts.
: "${SRC_DIR:=${SCRIPTS_DIR}/.src}"

# Source pins. The RaidenOffloadConnector lives on tpu-inference main, and the
# default commits below contain it. The vLLM pin is tied to the tpu-inference
# one (see tpu-inference/.buildkite/vllm_lkg.version); bump both together.
: "${VLLM_REPO:=https://github.com/vllm-project/vllm.git}"
: "${VLLM_COMMIT:=c8d2f3cb1485fcca725653fb92a445b6cc10ade7}"
: "${TPU_INFERENCE_REPO:=https://github.com/vllm-project/tpu-inference.git}"
: "${TPU_INFERENCE_COMMIT:=34f3afe929d95e7b7dca83571d83edee2306dd51}"

# Path of the connector module, relative to the tpu-inference checkout root.
OFFLOAD_CONNECTOR_REL="tpu_inference/offload/raiden_offload_connector.py"

# --- Preconditions --------------------------------------------------------
# A venv counts as active when VIRTUAL_ENV is non-empty or, failing that, when
# the interpreter reports a prefix different from its base prefix.
if [ -z "${VIRTUAL_ENV:-}" ]; then
  if ! python -c "import sys; sys.exit(0 if sys.prefix != sys.base_prefix else 1)" 2>/dev/null; then
    {
      echo "ERROR: no Python venv appears to be active."
      echo "Create and activate the venv per the repo README, then re-run."
    } >&2
    exit 1
  fi
fi

# tpu_raiden has to be present already, through either supported route; this
# script never installs it.
# tpu_raiden is a prerequisite that this script does not install. Accept either
# an importable package or the source-built extension under RAIDEN_ROOT; the
# run scripts resolve whichever is present at launch (see raiden_env.sh).
if ! python -c "import tpu_raiden.rpc.coordination_pb2" 2>/dev/null \
  && [ ! -f "${RAIDEN_ROOT}/tpu_raiden/frameworks/jax/_tpu_raiden_jax.so" ]; then
  echo "ERROR: tpu_raiden is not available. Install it via one of:" >&2
  echo " - build from source: run \`./build.sh jax\` from the repo root, or" >&2
  echo " - wheel: \`pip install tpu-raiden-jax --extra-index-url <index-url>\`" >&2
  exit 1
fi

# Summary of what is about to be used, including both source pins.
echo "venv: ${VIRTUAL_ENV:-$(python -c 'import sys; print(sys.prefix)')}"
echo "raiden root: ${RAIDEN_ROOT}"
echo "src dir: ${SRC_DIR}"
echo "vllm: ${VLLM_REPO} @ ${VLLM_COMMIT}"
echo "tpu-inference: ${TPU_INFERENCE_REPO} @ ${TPU_INFERENCE_COMMIT}"
mkdir -p "${SRC_DIR}"

# --- Clone (or update) at the pinned ref ----------------------------------
#######################################
# Ensure ${SRC_DIR}/<name> is a checkout of <repo> at exactly <ref>.
# Globals:   SRC_DIR (read)
# Arguments: $1 - checkout directory name under SRC_DIR
#            $2 - repository URL to clone when the checkout is missing
#            $3 - commit SHA, tag, or branch to check out
# Outputs:   progress lines and the resulting short SHA on stdout;
#            an error message on stderr when <ref> cannot be resolved
# Returns:   non-zero when clone, fetch, resolution, or checkout fails
#######################################
clone_pinned() {
  local name="$1" repo="$2" ref="$3" dest="${SRC_DIR}/$1"
  local target

  if [ -d "${dest}/.git" ]; then
    echo "=== ${name}: existing checkout, fetching ${ref} ==="
  else
    echo "=== ${name}: cloning ${repo} ==="
    git clone "${repo}" "${dest}"
  fi

  # Resolve <ref> to one concrete commit before touching the work tree.
  # A successful targeted fetch makes FETCH_HEAD the remote's current value of
  # <ref>, so prefer it: checking out a branch by name would reuse a possibly
  # stale local branch. When the targeted fetch is refused (stderr is silenced
  # because that is an expected outcome), do a general fetch and resolve <ref>
  # locally. FETCH_HEAD is NOT used in that case: after a general fetch it
  # names some remote branch tip, not the requested pin.
  if git -C "${dest}" fetch --quiet origin "${ref}" 2>/dev/null; then
    target="$(git -C "${dest}" rev-parse --verify --quiet 'FETCH_HEAD^{commit}')"
  else
    git -C "${dest}" fetch --quiet origin
    if ! target="$(git -C "${dest}" rev-parse --verify --quiet "${ref}^{commit}")"; then
      echo "ERROR: ${name}: ref '${ref}' was not found in ${repo}" >&2
      return 1
    fi
  fi

  # Detach at the resolved commit so the result is the same for a SHA, a tag,
  # or a branch.
  git -C "${dest}" checkout --quiet --detach "${target}"
  echo "${name} @ $(git -C "${dest}" rev-parse --short HEAD)"
}

clone_pinned vllm "${VLLM_REPO}" "${VLLM_COMMIT}"
clone_pinned tpu-inference "${TPU_INFERENCE_REPO}" "${TPU_INFERENCE_COMMIT}"

# --- Verify the offload connector is present in this tpu-inference ref -----
if [ ! -f "${SRC_DIR}/tpu-inference/${OFFLOAD_CONNECTOR_REL}" ]; then
  echo "ERROR: ${OFFLOAD_CONNECTOR_REL} is missing from the tpu-inference" >&2
  echo "checkout at ref '${TPU_INFERENCE_COMMIT}'. Use a ref that contains the" >&2
  echo "RaidenOffloadConnector (the pinned default does), e.g.:" >&2
  echo " TPU_INFERENCE_COMMIT=<ref> bash setup.sh" >&2
  exit 1
fi

# --- Install into the active venv (vllm first; tpu-inference depends on it) -
# Every install goes through `python -m pip` so packages land in the same
# interpreter that the precondition checks used, even if a different `pip`
# comes first on PATH.
echo "=== Installing vLLM (TPU target, editable) ==="
python -m pip install -r "${SRC_DIR}/vllm/requirements/tpu.txt"
# Default PyPI serves a CUDA torch, whose Torch CMake config makes vLLM's
# find_package(Torch) demand CUDA libs (build fails). Pin the matching CPU build
# (paired with torchvision==0.25.0 from tpu-inference's requirements).
python -m pip install --index-url https://download.pytorch.org/whl/cpu \
  torch==2.10.0+cpu torchvision==0.25.0+cpu
# --no-build-isolation: build against the venv's CPU torch. Without it, pip
# provisions a fresh CUDA torch in an isolated overlay and CMake fails on CUDA.
VLLM_TARGET_DEVICE="tpu" python -m pip install -e "${SRC_DIR}/vllm" --no-build-isolation

echo "=== Installing tpu-inference (editable) ==="
python -m pip install -r "${SRC_DIR}/tpu-inference/requirements.txt"
python -m pip install -e "${SRC_DIR}/tpu-inference"

# The benchmark client (benchmark.sh) uses tpu-inference's benchmark_serving.py,
# which imports these two eval-only helpers.
echo "=== Installing benchmark client deps (evaluate, nltk) ==="
python -m pip install evaluate nltk

echo ""
echo "=== Setup complete! ==="
echo "vllm + tpu_inference are installed in the active venv; tpu_raiden is"
echo "resolved at run time by raiden_env.sh (site-packages if wheel-installed, or"
echo "the source tree via PYTHONPATH if built from source)."
echo "Next: run the offloading demo with"
echo " bash ${SCRIPTS_DIR}/run_offload.sh"