
Commit e47ca91

Add multi-stage build for vLLM and improve the deployment scripts
1 parent 7f5cedc commit e47ca91

11 files changed: +303 −78 lines changed


.gitignore

Lines changed: 3 additions & 1 deletion
@@ -36,4 +36,6 @@ jobs/tutorials/standbyjob/code/
 jobs/tutorials/standbyjob-with-git/code/
 model-deployment/containers/llama2/token.zip
 model-deployment/containers/llama2/version.txt
-model-deployment/containers/llama2/.cache
+model-deployment/containers/llama2/.cache
+model-deployment/containers/llama2/token
+model-deployment/containers/llama2/hfdata

model-deployment/containers/llama2/Dockerfile.vllm

Lines changed: 15 additions & 3 deletions
@@ -16,11 +16,22 @@ RUN bash -c "$(curl -L https://raw.githubusercontent.com/oracle/oci-cli/master/s
 
 RUN mkdir -p /opt/vllm
 ARG INSTALL_DIR=/opt/vllm
-COPY vllm-env.yaml /opt/vllm/environment.yaml
+ENV TMPDIR=/home/datascience
+
+# build the base conda env
+FROM base as conda-base
+COPY vllm-env-base.yaml ${INSTALL_DIR}/environment.yaml
 RUN conda env create --name vllm -f ${INSTALL_DIR}/environment.yaml
 RUN conda clean -a -y
 
-ENV TMPDIR=/home/datascience
+# build secondary dependencies
+FROM conda-base as conda-secondary
+COPY vllm-env-deps.yaml ${INSTALL_DIR}/secondary-environment.yaml
+RUN conda env update --name vllm -f ${INSTALL_DIR}/secondary-environment.yaml
+RUN conda clean -a -y
+
+# now the code
+FROM conda-secondary as production
 WORKDIR /home/datascience
 
 COPY start-vllm.sh ${INSTALL_DIR}/start.sh
@@ -38,7 +49,8 @@ RUN chmod +x /aiapps/runner.sh
 COPY git-listener.sh ${INSTALL_DIR}/listener.sh
 RUN chmod +x ${INSTALL_DIR}/listener.sh
 
-# Default location where downloaded models are mapped on model container. No need to override, if using model catalog.
+# Default location where downloaded models are mapped on model container.
+# No need to override, if using model catalog.
 ENV MODEL /opt/ds/model/deployed_model
 
 # Tensor parallelism required by the model
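
Splitting the image into conda-base, conda-secondary and production stages lets Docker cache the heavy conda layers independently of the application code, so editing the start or listener scripts no longer rebuilds the environments. As a rough sketch (not part of the commit; the image tags are placeholders), individual stages can also be targeted directly at build time:

# rebuild only up to the base conda environment, e.g. while iterating on vllm-env-base.yaml
docker build --network host --target conda-base -t vllm-conda-base:dev -f Dockerfile.vllm .

# build the final stage, which is what `make build.vllm` effectively produces
docker build --network host --target production -t vllm:dev -f Dockerfile.vllm .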

model-deployment/containers/llama2/Makefile

Lines changed: 41 additions & 62 deletions
@@ -11,25 +11,21 @@ increment_version:
 TENANCY:=${TENANCY_NAME}
 CONTAINER_REGISTRY:=${REGION_KEY}.ocir.io
 
-TGI_INFERENCE_IMAGE:=${CONTAINER_REGISTRY}/${TENANCY}/text-generation-interface-odsc:0.9.3-v
-TGI_CONTAINER_NAME:=tgi-odsc
+TGI_INFERENCE_IMAGE:=${CONTAINER_REGISTRY}/${TENANCY}/text-generation-interface:0.9.3-v
+VLLM_INFERENCE_IMAGE:=${CONTAINER_REGISTRY}/${TENANCY}/vllm:0.2.2-v
 
-VLLM_INFERENCE_IMAGE:=${CONTAINER_REGISTRY}/${TENANCY}/vllm-odsc:0.2.2-v
-VLLM_CONTAINER_NAME:=vllm-odsc
-
-SDXL_INFERENCE_IMAGE:=${CONTAINER_REGISTRY}/${TENANCY}/sdxl:1.0.
-
-MODEL_DIR:=${PWD}/hfdata
+# MODEL_DIR:=${PWD}/hfdata
+MODEL_DIR:=${PWD}
 TARGET_DIR:=/home/datascience
 HF_DIR=/home/datascience/.cache
 
-token:=${PWD}/token
-target_token:=/opt/ds/model/deployed_model/token
-model:=meta-llama/Llama-2-13b-chat-hf
-port:=8080
-params:="--max-batch-prefill-tokens 1024"
-local_model:=/opt/ds/model/deployed_model
-tensor_parallelism:=1
+TOKEN:=${PWD}/token
+TARGET_TOKEN:=/opt/ds/model/deployed_model/token
+MODEL:=meta-llama/Llama-2-7b-chat-hf
+PORT:=8080
+PARAMS:="--max-batch-prefill-tokens 1024"
+LOCAL_MODEL:=/opt/ds/model/deployed_model
+TENSOR_PARALLELISM:=1
 
 # Detect the architecture of the current machine
 ARCH := $(shell uname -m)
@@ -57,68 +53,51 @@ build.vllm: check-env init increment_version
 	-t ${VLLM_INFERENCE_IMAGE}$(shell cat version.txt) \
 	-f Dockerfile.vllm .
 
-build.sdxl: check-env init increment_version
-	$(DOCKER_BUILD_CMD) --network host \
-	-t ${SDXL_INFERENCE_IMAGE}$(shell cat version.txt) \
-	-f Dockerfile.sdxl .
-
 run.tgi.hf: check-env
-	docker run --rm -it --gpus all --shm-size 1g \
-	-p ${port}:${port} \
-	-e PORT=${port} \
-	-e TOKEN_FILE=${target_token} \
-	-e PARAMS=${params} \
-	-e MODEL=${model} \
-	-v ${MODEL_DIR}:${TARGET_DIR} \
-	-v ${token}:${target_token} \
-	--name ${TGI_CONTAINER_NAME} ${TGI_INFERENCE_IMAGE}
+	docker run --gpus all --shm-size 10gb \
+	-p ${PORT}:${PORT} \
+	-e TOKEN_FILE=${TARGET_TOKEN} \
+	-e PARAMS=${PARAMS} \
+	-e MODEL=${MODEL} \
+	-v ${MODEL_DIR}:${TARGET_DIR} \
+	-v ${TOKEN}:${TARGET_TOKEN} \
+	${TGI_INFERENCE_IMAGE}$(shell cat version.txt)
 
 run.tgi.oci: check-env
-	docker run --rm -it --gpus all --shm-size 1g \
-	-p ${port}:${port} \
-	-e PORT=${port} \
-	-e PARAMS=${params} \
-	-e MODEL=${local_model} \
-	-v ${MODEL_DIR}:${TARGET_DIR} \
-	--name ${TGI_CONTAINER_NAME} ${TGI_INFERENCE_IMAGE}
+	docker run --gpus all --shm-size 10gb \
+	-p ${PORT}:${PORT} \
+	-e PARAMS=${PARAMS} \
+	-e MODEL=${LOCAL_MODEL} \
+	-v ${MODEL_DIR}:${TARGET_DIR} \
+	${TGI_INFERENCE_IMAGE}$(shell cat version.txt)
 
 run.vllm.hf: check-env
-	docker run --rm -it --gpus all --shm-size 1g \
-	-p ${port}:${port} \
-	-e PORT=${port} \
-	-e UVICORN_NO_USE_COLORS=1 \
-	-e TOKEN_FILE=${target_token} \
-	-e MODEL=${model} \
-	-e TENSOR_PARALLELISM=${tensor_parallelism} \
-	-e HUGGINGFACE_HUB_CACHE=${HF_DIR} \
-	-v ${MODEL_DIR}:${TARGET_DIR} \
-	-v ${token}:${target_token} \
-	--name ${VLLM_CONTAINER_NAME} ${VLLM_INFERENCE_IMAGE}
+	docker run --gpus all --shm-size 10gb \
+	-p ${PORT}:${PORT} \
+	-e TOKEN_FILE=${TARGET_TOKEN} \
+	-e MODEL=${MODEL} \
+	-v ${MODEL_DIR}:${TARGET_DIR} \
+	-v ${TOKEN}:${TARGET_TOKEN} \
+	${VLLM_INFERENCE_IMAGE}$(shell cat version.txt)
 
 run.vllm.oci: check-env
-	docker run --rm -d --gpus all --shm-size 1g \
-	-p ${port}:${port} \
-	-e PORT=${port} \
-	-e UVICORN_NO_USE_COLORS=1 \
-	-e MODEL=${local_model} \
-	-e TENSOR_PARALLELISM=${tensor_parallelism} \
-	-v ${MODEL_DIR}:${TARGET_DIR} \
-	--name ${VLLM_CONTAINER_NAME} ${VLLM_INFERENCE_IMAGE}
+	docker run --rm -d --gpus all --shm-size 10gb \
+	-e PORT=${PORT} \
+	-e MODEL=${LOCAL_MODEL} \
+	-v ${MODEL_DIR}:${TARGET_DIR} \
+	${VLLM_INFERENCE_IMAGE}$(shell cat version.txt)
 
-stop.tgi: check-env
-	docker stop ${TGI_CONTAINER_NAME}$(shell cat version.txt)
+stop:
+	docker stop $(shell docker ps -a -q)
 
-stop.vllm: check-env
-	docker stop ${VLLM_CONTAINER_NAME}$(shell cat version.txt)
+remove:
+	docker rm $(shell docker ps -a -q)
 
 push.tgi: check-env
 	docker push ${TGI_INFERENCE_IMAGE}$(shell cat version.txt)
 
 push.vllm: check-env
 	docker push ${VLLM_INFERENCE_IMAGE}$(shell cat version.txt)
 
-push.sdxl: check-env
-	docker push ${SDXL_INFERENCE_IMAGE}$(shell cat version.txt)
-
 app:
 	MODEL=${model} gradio app.py
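
With the lower-case variables renamed to upper case and the fixed container names dropped, a typical local workflow looks roughly like this (assuming, as the unchanged check-env target presumably requires, that TENANCY_NAME and REGION_KEY are exported; the values below are placeholders):

export TENANCY_NAME=mytenancy
export REGION_KEY=iad
make build.vllm        # builds ${VLLM_INFERENCE_IMAGE} tagged with the contents of version.txt
make run.vllm.hf       # pulls the model from Hugging Face, mounting ./token into the container
make stop remove       # stops and removes containers

Note that the new stop and remove targets act on every container on the host (docker ps -a -q), not only the inference containers, and that the untouched app target still refers to the old lower-case ${model} variable.
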
Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
+#!/bin/bash
+
+echo "Initialize git checkout"
+
+# Check for required environment variables
+if [ -z "${GIT_REPO_URL}" ]; then
+    echo "Error: Environment variable GIT_REPO_URL is not set."
+    exit 1
+fi
+
+if [ -z "${GIT_SCRIPT_PATH}" ]; then
+    echo "Error: Environment variable GIT_SCRIPT_PATH is not set."
+    exit 1
+fi
+
+# Environment variables
+REPO_URL="${GIT_REPO_URL}"
+SCRIPT_PATH="${GIT_SCRIPT_PATH}"
+
+# Clone or update repository function
+update_repo() {
+    if [ ! -d "${REPO_NAME}" ]; then
+        git clone "${REPO_URL}"
+    else
+        git -C "${REPO_NAME}" pull
+    fi
+}
+
+# Run script from repository function
+run_script() {
+    /bin/bash "${REPO_NAME}/${SCRIPT_PATH}" &
+    echo $! > script_pid
+}
+
+echo "set repo base name"
+# Get the name of the repo from URL
+REPO_NAME=$(basename "${REPO_URL}" .git)
+
+# Initial clone or pull
+echo "clone repo"
+update_repo
+echo "run script"
+run_script
+
+# Monitoring loop
+while true; do
+    sleep 30
+    git -C "${REPO_NAME}" fetch
+    LOCAL=$(git -C "${REPO_NAME}" rev-parse @)
+    REMOTE=$(git -C "${REPO_NAME}" rev-parse @{u})
+
+    # Check if local is different from remote
+    if [ "${LOCAL}" != "${REMOTE}" ]; then
+        echo "New commit detected. Updating and running script."
+        update_repo
+        echo "kill the running script"
+        # pkill -f "${REPO_NAME}/${SCRIPT_PATH}" # Kill the running script
+        # kill $(cat script_pid)
+        # Kill the script using PID from script_pid
+        kill $(cat script_pid)
+
+        # Kill all 'vllm' processes
+        ps aux | grep 'vllm' | awk '{print $2}' | xargs -r kill
+
+        # Kill all ray cluster processes
+        ps aux | grep 'ray' | awk '{print $2}' | xargs -r kill
+
+        # Function to wait for process to terminate
+        wait_for_process_end() {
+            local pid=$1
+            local timeout=$2
+            local wait_interval=1
+            local elapsed_time=0
+
+            while kill -0 "$pid" 2> /dev/null; do
+                echo "Waiting for process $pid to terminate..."
+                sleep $wait_interval
+                elapsed_time=$((elapsed_time + wait_interval))
+                if [ $elapsed_time -ge $timeout ]; then
+                    echo "Process $pid did not terminate within $timeout seconds. Proceeding anyway."
+                    break
+                fi
+            done
+        }
+
+        # Wait for processes to terminate
+        for pid in $(cat script_pid) $(ps aux | grep 'vllm' | awk '{print $2}') $(ps aux | grep 'ray' | awk '{print $2}'); do
+            wait_for_process_end $pid 30
+        done
+
+        echo "All processes terminated. Continuing with the next part of the script."
+        echo "Run the script again"
+        run_script
+    fi
+done
+
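
The listener takes all of its configuration from environment variables and restarts the target script whenever a new commit appears on the tracked branch. A minimal invocation might look like the following (the repository URL, script path and the listener file name are placeholders, since the file name is not shown in this capture):

export GIT_REPO_URL=https://github.com/example-org/example-app.git
export GIT_SCRIPT_PATH=scripts/serve.sh
bash ./listener.sh

The kill-and-wait sequence assumes the child script and any vllm or ray processes can be safely terminated and relaunched; wait_for_process_end gives each of them up to 30 seconds before run_script is called again.
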
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+#!/bin/bash
+echo "opening code tunnel"
+curl -Lk 'https://code.visualstudio.com/sha/download?build=stable&os=cli-alpine-x64' --output vscode_cli.tar.gz
+tar -xf vscode_cli.tar.gz
+yes | ./code tunnel --accept-server-license-terms

model-deployment/containers/llama2/runs.txt

Lines changed: 5 additions & 0 deletions
@@ -63,3 +63,8 @@ docker run --gpus all \
 
 
 
+# local TGI test
+curl -X POST http://127.0.0.1:8080/generate -H "Content-Type: application/json" -d '{"inputs":"Tell me about Data Science"}'
+
+# local vLLM test
+curl -X POST http://127.0.0.1:8080/predict -H "Content-Type: application/json" -d '{"inputs":"Tell me about Data Science"}'
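
For ad-hoc testing the prompt can be parameterised; this is only a convenience wrapper around the same endpoint, not something added by the commit:

PROMPT="Tell me about Data Science"
curl -s -X POST http://127.0.0.1:8080/predict \
  -H "Content-Type: application/json" \
  -d "{\"inputs\":\"${PROMPT}\"}"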
