
Commit 4ab8c9b

bbartels authored and devpatelio committed
[CI/build] Removes source compilation from runtime image (vllm-project#26966)
Signed-off-by: bbartels <benjamin@bartels.dev>
1 parent ea0cb49 commit 4ab8c9b

4 files changed (+159, -115 lines)

docker/Dockerfile

Lines changed: 43 additions & 27 deletions
@@ -85,7 +85,7 @@ ARG GET_PIP_URL
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
-&& apt-get install -y ccache software-properties-common git curl sudo python3-pip \
+&& apt-get install -y ccache software-properties-common git curl sudo python3-pip libibverbs-dev \
&& curl -LsSf https://astral.sh/uv/install.sh | sh \
&& $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \
&& rm -f /usr/bin/python3 /usr/bin/python3-config /usr/bin/pip \
@@ -224,6 +224,22 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
fi

+# Install DeepGEMM from source
+ARG DEEPGEMM_GIT_REF
+COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
+RUN --mount=type=cache,target=/root/.cache/uv \
+VLLM_DOCKER_BUILD_CONTEXT=1 TORCH_CUDA_ARCH_LIST="9.0a 10.0a" /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"} --wheel-dir /tmp/deepgemm/dist
+
+# Ensure the wheel dir exists so later-stage COPY won't fail when DeepGEMM is skipped
+RUN mkdir -p /tmp/deepgemm/dist && touch /tmp/deepgemm/dist/.deepgemm_skipped
+
+COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh
+# Install EP kernels(pplx-kernels and DeepEP)
+RUN --mount=type=cache,target=/root/.cache/uv \
+export TORCH_CUDA_ARCH_LIST='9.0a 10.0a' && \
+/tmp/install_python_libraries.sh /tmp/ep_kernels_workspace wheel && \
+find /tmp/ep_kernels_workspace/nvshmem -name '*.a' -delete
+
# Check the size of the wheel if RUN_WHEEL_CHECK is true
COPY .buildkite/check-wheel-size.py check-wheel-size.py
# sync the default value with .buildkite/check-wheel-size.py
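With this change, DeepGEMM and the EP kernels (pplx-kernels, DeepEP) are compiled once in the build stage and exported as wheels instead of being compiled again in the runtime image. A minimal sketch of exercising just that stage locally; the image tag is a placeholder and "main" is only an illustrative value for DEEPGEMM_GIT_REF:

# build only the wheel-producing stage of docker/Dockerfile
docker build -f docker/Dockerfile --target build \
  --build-arg DEEPGEMM_GIT_REF=main \
  -t vllm-build-stage .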
@@ -289,7 +305,7 @@ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
-&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
+&& apt-get install -y software-properties-common curl sudo python3-pip \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \
@@ -356,36 +372,32 @@ RUN --mount=type=cache,target=/root/.cache/uv \
. /etc/environment && \
uv pip list

-# Even when we build Flashinfer with AOT mode, there's still
-# some issues w.r.t. JIT compilation. Therefore we need to
-# install build dependencies for JIT compilation.
-# TODO: Remove this once FlashInfer AOT wheel is fixed
-COPY requirements/build.txt requirements/build.txt
+# Install deepgemm wheel that has been built in the `build` stage
RUN --mount=type=cache,target=/root/.cache/uv \
-uv pip install --system -r requirements/build.txt \
+--mount=type=bind,from=build,source=/tmp/deepgemm/dist,target=/tmp/deepgemm/dist,ro \
+sh -c 'if ls /tmp/deepgemm/dist/*.whl >/dev/null 2>&1; then \
+uv pip install --system /tmp/deepgemm/dist/*.whl; \
+else \
+echo "No DeepGEMM wheels to install; skipping."; \
+fi'
+
+# Pytorch now installs NVSHMEM, setting LD_LIBRARY_PATH (https://github.com/pytorch/pytorch/blob/d38164a545b4a4e4e0cf73ce67173f70574890b6/.ci/manywheel/build_cuda.sh#L141C14-L141C36)
+ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+
+# Install EP kernels wheels (pplx-kernels and DeepEP) that have been built in the `build` stage
+RUN --mount=type=bind,from=build,src=/tmp/ep_kernels_workspace/dist,target=/vllm-workspace/ep_kernels/dist \
+--mount=type=cache,target=/root/.cache/uv \
+uv pip install --system ep_kernels/dist/*.whl --verbose \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

-# Install DeepGEMM from source
-ARG DEEPGEMM_GIT_REF
-COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
-RUN --mount=type=cache,target=/root/.cache/uv \
-VLLM_DOCKER_BUILD_CONTEXT=1 TORCH_CUDA_ARCH_LIST="9.0a 10.0a" /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"}
-
-COPY tools/install_gdrcopy.sh install_gdrcopy.sh
-RUN set -eux; \
+RUN --mount=type=bind,source=tools/install_gdrcopy.sh,target=/tmp/install_gdrcopy.sh,ro \
+set -eux; \
case "${TARGETPLATFORM}" in \
linux/arm64) UUARCH="aarch64" ;; \
linux/amd64) UUARCH="x64" ;; \
*) echo "Unsupported TARGETPLATFORM: ${TARGETPLATFORM}" >&2; exit 1 ;; \
esac; \
-./install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "${GDRCOPY_CUDA_VERSION}" "${UUARCH}"; \
-rm ./install_gdrcopy.sh
-
-# Install EP kernels(pplx-kernels and DeepEP)
-COPY tools/ep_kernels/install_python_libraries.sh install_python_libraries.sh
-ENV CUDA_HOME=/usr/local/cuda
-RUN export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0a 10.0a+PTX}" \
-&& bash install_python_libraries.sh
+/tmp/install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "${GDRCOPY_CUDA_VERSION}" "${UUARCH}"

# CUDA image changed from /usr/local/nvidia to /usr/local/cuda in 12.8 but will
# return to /usr/local/nvidia in 13.0 to allow container providers to mount drivers
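The runtime stage now only installs the prebuilt wheels from the build stage and bind-mounts the gdrcopy installer, so it no longer installs requirements/build.txt or compiles DeepGEMM and the EP kernels during the image build. A quick, illustrative smoke test against a locally built image (the tag is a placeholder; the command simply reports what is present):

docker run --rm vllm-local sh -c 'for t in gcc ccache git; do command -v "$t" >/dev/null && echo "$t: present" || echo "$t: absent"; done'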
@@ -415,6 +427,11 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy

+RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
+&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
+&& apt-get update -y \
+&& apt-get install -y git
+
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
@@ -455,12 +472,11 @@ ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500

-COPY requirements/kv_connectors.txt requirements/kv_connectors.txt
-
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/uv \
+--mount=type=bind,source=requirements/kv_connectors.txt,target=/tmp/kv_connectors.txt,ro \
if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \
-uv pip install --system -r requirements/kv_connectors.txt; \
+uv pip install --system -r /tmp/kv_connectors.txt; \
fi; \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
BITSANDBYTES_VERSION="0.42.0"; \
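requirements/kv_connectors.txt is now bind-mounted into the RUN step instead of being COPY'd into the image, so it never persists in a layer; installation remains gated on INSTALL_KV_CONNECTORS. Assuming that flag is exposed as a build arg, as the conditional above suggests, an illustrative invocation would be:

docker build -f docker/Dockerfile --build-arg INSTALL_KV_CONNECTORS=true -t vllm-openai-local .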
Lines changed: 86 additions & 74 deletions
@@ -1,94 +1,79 @@
#!/usr/bin/env bash
set -ex

-# prepare workspace directory
-WORKSPACE=$1
-if [ -z "$WORKSPACE" ]; then
-export WORKSPACE=$(pwd)/ep_kernels_workspace
-fi
+# usage: ./build.sh [workspace_dir] [mode]
+# mode: "install" (default) → install directly into current Python env
+# "wheel" → build wheels into WORKSPACE/dist

-if [ ! -d "$WORKSPACE" ]; then
-mkdir -p $WORKSPACE
-fi
+WORKSPACE=${1:-$(pwd)/ep_kernels_workspace}
+MODE=${2:-install}
+mkdir -p "$WORKSPACE"
+
+WHEEL_DIR="$WORKSPACE/dist"
+mkdir -p "$WHEEL_DIR"
+NVSHMEM_VER=3.3.9
+
+pushd "$WORKSPACE"

-# configurable pip command (default: pip3)
-PIP_CMD=${PIP_CMD:-pip3}
CUDA_HOME=${CUDA_HOME:-/usr/local/cuda}

# install dependencies if not installed
-$PIP_CMD install cmake torch ninja
-
-# build nvshmem
-pushd $WORKSPACE
-mkdir -p nvshmem_src
-wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz
-tar -xvf nvshmem_src_3.2.5-1.txz -C nvshmem_src --strip-components=1
-pushd nvshmem_src
-wget https://github.com/deepseek-ai/DeepEP/raw/main/third-party/nvshmem.patch
-git init
-git apply -vvv nvshmem.patch
-
-# assume CUDA_HOME is set correctly
-if [ -z "$CUDA_HOME" ]; then
-echo "CUDA_HOME is not set, please set it to your CUDA installation directory."
-exit 1
+if [ -z "$VIRTUAL_ENV" ]; then
+uv pip install --system cmake torch ninja
+else
+uv pip install cmake torch ninja
fi

-# assume TORCH_CUDA_ARCH_LIST is set correctly
-if [ -z "$TORCH_CUDA_ARCH_LIST" ]; then
-echo "TORCH_CUDA_ARCH_LIST is not set, please set it to your desired architecture."
+# fetch nvshmem
+ARCH=$(uname -m)
+case "${ARCH,,}" in
+x86_64|amd64)
+NVSHMEM_SUBDIR="linux-x86_64"
+NVSHMEM_FILE="libnvshmem-linux-x86_64-${NVSHMEM_VER}_cuda12-archive.tar.xz"
+;;
+aarch64|arm64)
+NVSHMEM_SUBDIR="linux-sbsa"
+NVSHMEM_FILE="libnvshmem-linux-sbsa-${NVSHMEM_VER}_cuda12-archive.tar.xz"
+;;
+*)
+echo "Unsupported architecture: ${ARCH}" >&2
exit 1
-fi
-
-# disable all features except IBGDA
-export NVSHMEM_IBGDA_SUPPORT=1
-
-export NVSHMEM_SHMEM_SUPPORT=0
-export NVSHMEM_UCX_SUPPORT=0
-export NVSHMEM_USE_NCCL=0
-export NVSHMEM_PMIX_SUPPORT=0
-export NVSHMEM_TIMEOUT_DEVICE_POLLING=0
-export NVSHMEM_USE_GDRCOPY=0
-export NVSHMEM_IBRC_SUPPORT=0
-export NVSHMEM_BUILD_TESTS=0
-export NVSHMEM_BUILD_EXAMPLES=0
-export NVSHMEM_MPI_SUPPORT=0
-export NVSHMEM_BUILD_HYDRA_LAUNCHER=0
-export NVSHMEM_BUILD_TXZ_PACKAGE=0
-export NVSHMEM_TIMEOUT_DEVICE_POLLING=0
-
-cmake -G Ninja -S . -B $WORKSPACE/nvshmem_build/ -DCMAKE_INSTALL_PREFIX=$WORKSPACE/nvshmem_install
-cmake --build $WORKSPACE/nvshmem_build/ --target install
-
+;;
+esac
+
+NVSHMEM_URL="https://developer.download.nvidia.com/compute/nvshmem/redist/libnvshmem/${NVSHMEM_SUBDIR}/${NVSHMEM_FILE}"
+
+pushd "$WORKSPACE"
+echo "Downloading NVSHMEM ${NVSHMEM_VER} for ${NVSHMEM_SUBDIR} ..."
+curl -fSL "${NVSHMEM_URL}" -o "${NVSHMEM_FILE}"
+tar -xf "${NVSHMEM_FILE}"
+mv "${NVSHMEM_FILE%.tar.xz}" nvshmem
+rm -f "${NVSHMEM_FILE}"
+rm -rf nvshmem/lib/bin nvshmem/lib/share
popd

-export CMAKE_PREFIX_PATH=$WORKSPACE/nvshmem_install:$CMAKE_PREFIX_PATH
+export CMAKE_PREFIX_PATH=$WORKSPACE/nvshmem/lib/cmake:$CMAKE_PREFIX_PATH

is_git_dirty() {
local dir=$1
pushd "$dir" > /dev/null
-
-if [ -d ".git" ] && [ -n "$(git status --porcelain 2>/dev/null)" ]; then
+if [ -d ".git" ] && [ -n "$(git status --porcelain 3>/dev/null)" ]; then
popd > /dev/null
-return 0 # dirty (true)
+return 0
else
popd > /dev/null
-return 1 # clean (false)
+return 1
fi
}

-# Function to handle git repository cloning with dirty/incomplete checks
clone_repo() {
local repo_url=$1
local dir_name=$2
local key_file=$3
local commit_hash=$4
-
if [ -d "$dir_name" ]; then
-# Check if directory has uncommitted changes (dirty)
if is_git_dirty "$dir_name"; then
echo "$dir_name directory is dirty, skipping clone"
-# Check if clone failed (directory exists but not a valid git repo or missing key files)
elif [ ! -d "$dir_name/.git" ] || [ ! -f "$dir_name/$key_file" ]; then
echo "$dir_name directory exists but clone appears incomplete, cleaning up and re-cloning"
rm -rf "$dir_name"
@@ -99,7 +84,7 @@ clone_repo() {
cd ..
fi
else
-echo "$dir_name directory exists and appears complete; manually update if needed"
+echo "$dir_name directory exists and appears complete"
fi
else
git clone "$repo_url"
@@ -111,17 +96,44 @@ clone_repo() {
fi
}

-# build and install pplx, require pytorch installed
-pushd $WORKSPACE
-clone_repo "https://github.com/ppl-ai/pplx-kernels" "pplx-kernels" "setup.py" "c336faf"
-cd pplx-kernels
-$PIP_CMD install --no-build-isolation -vvv -e .
-popd
+do_build() {
+local repo=$1
+local name=$2
+local key=$3
+local commit=$4
+local extra_env=$5

-# build and install deepep, require pytorch installed
-pushd $WORKSPACE
-clone_repo "https://github.com/deepseek-ai/DeepEP" "DeepEP" "setup.py" "73b6ea4"
-cd DeepEP
-export NVSHMEM_DIR=$WORKSPACE/nvshmem_install
-$PIP_CMD install --no-build-isolation -vvv -e .
-popd
+pushd "$WORKSPACE"
+clone_repo "$repo" "$name" "$key" "$commit"
+cd "$name"
+
+if [ "$MODE" = "install" ]; then
+echo "Installing $name into environment"
+eval "$extra_env" uv pip install --no-build-isolation -vvv .
+else
+echo "Building $name wheel into $WHEEL_DIR"
+eval "$extra_env" uv build --wheel --no-build-isolation -vvv --out-dir "$WHEEL_DIR" .
+fi
+popd
+}
+
+# build pplx-kernels
+do_build \
+"https://github.com/ppl-ai/pplx-kernels" \
+"pplx-kernels" \
+"setup.py" \
+"12cecfd" \
+""
+
+# build DeepEP
+do_build \
+"https://github.com/deepseek-ai/DeepEP" \
+"DeepEP" \
+"setup.py" \
+"73b6ea4" \
+"export NVSHMEM_DIR=$WORKSPACE/nvshmem; "
+
+if [ "$MODE" = "wheel" ]; then
+echo "All wheels written to $WHEEL_DIR"
+ls -l "$WHEEL_DIR"
+fi
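The rewritten script keeps its old default behaviour while adding the wheel mode that the Dockerfile's build stage uses. A few illustrative invocations, assuming it is run from the vLLM repo root (the path matches the COPY in the Dockerfile above) with CUDA and uv available:

# no arguments: workspace defaults to ./ep_kernels_workspace, mode defaults to "install"
bash tools/ep_kernels/install_python_libraries.sh

# explicit workspace plus wheel mode: pplx-kernels and DeepEP wheels land in /tmp/ws/dist
TORCH_CUDA_ARCH_LIST='9.0a 10.0a' bash tools/ep_kernels/install_python_libraries.sh /tmp/ws wheel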
