
Commit 0249329

dannawang0221 authored and juncgu-google committed
Update benchmark pods
Signed-off-by: dannawang <dannawang@google.com>
1 parent a3ff52b commit 0249329

File tree

2 files changed: +10 −2 lines changed

examples/offload/gke/benchmarks/deploy-baseline.yaml
examples/offload/gke/benchmarks/deploy-cpu-offload.yaml

examples/offload/gke/benchmarks/deploy-baseline.yaml

Lines changed: 1 addition & 1 deletion

@@ -21,7 +21,7 @@ spec:
         imagePullPolicy: Always
         command: ["/bin/sh", "-c"]
         args:
-        - "vllm serve meta-llama/Llama-3.3-70B-Instruct --port 8000 --max_num_batched_tokens 2048 --enable-chunked-prefill --tensor-parallel-size 8 --seed 42 --enable_prefix_caching --gpu-memory-utilization 0.9"
+        - "vllm serve meta-llama/Llama-3.3-70B-Instruct --port 8000 --enable-chunked-prefill --tensor-parallel-size 8 --seed 42 --enable_prefix_caching --gpu-memory-utilization 0.9"
         env:
         - name: HUGGING_FACE_HUB_TOKEN
           valueFrom:
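
The only change here is dropping the explicit --max_num_batched_tokens 2048 cap, so vLLM falls back to its default batching limit. A hedged smoke test once the baseline pod is serving (the deployment name is a placeholder, not taken from this commit):

    # Forward the vLLM port locally; substitute your actual deployment name.
    kubectl port-forward deploy/<baseline-deployment> 8000:8000 &
    # List served models via vLLM's OpenAI-compatible API.
    curl -s http://localhost:8000/v1/models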

examples/offload/gke/benchmarks/deploy-cpu-offload.yaml

Lines changed: 9 additions & 1 deletion

@@ -15,13 +15,21 @@ spec:
       nodeSelector:
         cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice
         cloud.google.com/gke-tpu-topology: 2x4 # Specify the physical topology for the TPU slice.
+      initContainers:
+      - name: increase-vm-max-map-count
+        image: busybox
+        # WARNING: This changes the HOST memory settings (vm.max_map_count), not just the container.
+        # Required to prevent vLLM crashes due to memory mapping limits.
+        command: ["sysctl", "-w", "vm.max_map_count=1048576"]
+        securityContext:
+          privileged: true
       containers:
       - name: tpu-job
         image: <your-tpu-inference-container-image>
         imagePullPolicy: Always
         command: ["/bin/sh", "-c"]
         args:
-        - "vllm serve meta-llama/Llama-3.3-70B-Instruct --kv-transfer-config '{\"kv_connector\":\"TPUOffloadConnector\",\"kv_role\":\"kv_both\",\"kv_connector_module_path\":\"tpu_inference.distributed.offload.tpu_offload_connector\"}' --port 8000 --max_num_batched_tokens 2048 --enable-chunked-prefill --tensor-parallel-size 8 --seed 42 --enable_prefix_caching --gpu-memory-utilization 0.9"
+        - "vllm serve meta-llama/Llama-3.3-70B-Instruct --kv-transfer-config '{\"kv_connector\":\"TPUOffloadConnector\",\"kv_role\":\"kv_both\",\"kv_connector_module_path\":\"tpu_inference.distributed.offload.tpu_offload_connector\"}' --port 8000 --enable-chunked-prefill --tensor-parallel-size 8 --seed 42 --enable_prefix_caching --gpu-memory-utilization 0.9"
         env:
         - name: HUGGING_FACE_HUB_TOKEN
           valueFrom:
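
Because vm.max_map_count is a node-level (non-namespaced) sysctl, the init container must run privileged, and the new value applies to the whole host, as the in-line WARNING notes. A quick way to confirm the setting took effect, sketched under the assumption that you can run a debug pod on the node (pod and node names are placeholders):

    # Find the node hosting the offload pod, then read the host sysctl from it.
    kubectl get pod <offload-pod> -o wide          # note the NODE column
    kubectl debug node/<node-name> -it --image=busybox -- sysctl vm.max_map_count
    # Expected output: vm.max_map_count = 1048576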
