# Section of benchmarks/multi_node/amd_utils/job.slurm (from line 213 onward).

# =============================================================================
# Node NIC driver check
# =============================================================================

#######################################
# Verify that every node resolves the RDMA userspace libraries to the same
# file.
#
# For each library, `readlink -f` is run on every node over ssh and the
# resolved path is compared with the result from the first node in the list.
#
# Arguments:
#   $1 - Slurm nodelist expression; expanded with `scontrol show hostnames`.
# Outputs:
#   Per-node results on stdout; warnings and mismatches on stderr.
# Returns:
#   0 when all nodes agree (or when no hostnames could be resolved).
#   Exits the whole script with status 1 when any node disagrees.
#######################################
_check_rdma_lib_versions() {
    local nodelist="$1"
    local -a libs=(
        "/usr/lib/x86_64-linux-gnu/libionic.so.1"
        "/usr/lib/x86_64-linux-gnu/libibverbs.so.1"
        "/usr/lib/x86_64-linux-gnu/libibverbs/libionic-rdmav34.so"
    )
    local -a nodes=()
    local lib node ver first_ver
    local -i idx
    local all_ok=1

    echo "[rdma-check] Checking RDMA library versions across nodes: $nodelist"

    # Expand the nodelist once; it does not change between libraries.
    mapfile -t nodes < <(scontrol show hostnames "$nodelist")
    if (( ${#nodes[@]} == 0 )); then
        echo "[rdma-check] WARNING: no hostnames resolved from '$nodelist'; nothing was checked." >&2
        return 0
    fi

    for lib in "${libs[@]}"; do
        echo "[rdma-check] --- $lib ---"
        first_ver=""

        for idx in "${!nodes[@]}"; do
            node="${nodes[$idx]}"
            # -n keeps ssh from reading (and draining) this shell's stdin.
            # The remote command prints MISSING if readlink fails; SSH_FAIL is
            # substituted locally when ssh itself exits non-zero.
            ver=$(ssh -n -o StrictHostKeyChecking=no -o BatchMode=yes "$node" \
                "readlink -f '$lib' 2>/dev/null || echo MISSING" 2>/dev/null || echo "SSH_FAIL")
            echo "[rdma-check] $node: $ver"

            if [[ "$ver" == *SSH_FAIL ]]; then
                echo "[rdma-check] WARNING: could not query $node over ssh; $lib was not verified there" >&2
            fi

            if (( idx == 0 )); then
                # The first node is the reference every other node must match.
                first_ver="$ver"
            elif [[ "$ver" != "$first_ver" ]]; then
                echo "[rdma-check] ERROR: $lib version mismatch on $node: '$ver' != '$first_ver'" >&2
                all_ok=0
            fi
        done
    done

    if [[ "$all_ok" -eq 0 ]]; then
        echo "[rdma-check] FATAL: RDMA library version mismatch detected across nodes. Aborting." >&2
        exit 1
    fi
    echo "[rdma-check] All RDMA library versions consistent across nodes."
}

_check_rdma_lib_versions "$SLURM_JOB_NODELIST"

# NOTE(review): everything below is commented out in this revision, so the
# script currently ends after the RDMA check above — node selection, IP
# resolution, the docker launch and container cleanup do not run. Confirm this
# is intentional (looks like a temporary debugging state) before merging.

## =============================================================================
## Node Selection
## =============================================================================
#
#NUM_NODES=$((xP + yD))
#echo "NUM_NODES: $NUM_NODES (xP=$xP + yD=$yD)"
#
#FULL_NODELIST=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
#SELECTED_NODES=$(echo "$FULL_NODELIST" | head -n $NUM_NODES)
#SELECTED_NODELIST_STR=$(echo "$SELECTED_NODES" | tr '\n' ',' | sed 's/,$//')
#
## Docker privilege detection — evaluated per-node since group membership varies.
## Exported as a snippet so every srun participant resolves it locally.
#export DOCKER_CMD_DETECT='if docker ps &>/dev/null 2>&1; then DOCKER_CMD=docker; else DOCKER_CMD="sudo docker"; fi'
#
## Update SLURM environment variables
#export SLURM_NNODES=$NUM_NODES
#export SLURM_NTASKS=$NUM_NODES
#export SLURM_JOB_NUM_NODES=$NUM_NODES
#export SLURM_NPROCS=$NUM_NODES
#export SLURM_JOB_NODELIST="$SELECTED_NODELIST_STR"
#export SLURM_NODELIST="$SELECTED_NODELIST_STR"
#export SLURM_TASKS_PER_NODE="1(x$NUM_NODES)"
#export SLURM_NTASKS_PER_NODE=1
#
#echo ""
#echo "Selected nodes: $SELECTED_NODELIST_STR"
#
## =============================================================================
## IP Resolution
## =============================================================================
#
#USER_NAME=$(whoami)
#MASTER_NODE=$(echo "$SELECTED_NODES" | head -n 1)
#NODE0_ADDR=$(srun --nodes=1 --ntasks=1 --time=00:20:00 --nodelist="$MASTER_NODE" bash -c 'ip route get 1.1.1.1')
#NODE0_ADDR=$(echo "$NODE0_ADDR" | awk '/src/ {print $7}')
#
#IPS=()
#for NODE in $SELECTED_NODES; do
#    IP=$(srun --nodes=1 --ntasks=1 --time=00:20:00 --nodelist="$NODE" bash -c 'ip route get 1.1.1.1')
#    IP=$(echo "$IP" | awk '/src/ {print $7}')
#    IPS+=("$IP")
#done
#
#echo "Node IPs: ${IPS[*]}"
#
#DOCKER_MOUNT_PATH="/workspace"
#WS_PATH="${DOCKER_MOUNT_PATH}/benchmarks/multi_node/amd_utils"
#
#NNODES=$NUM_NODES
#
#echo "MASTER_NODE: ${MASTER_NODE}"
#echo "NODE0_ADDR: ${NODE0_ADDR}"
#echo "NNODES: ${NNODES}"
#echo "REPO DIR: ${DI_REPO_DIR}"
#echo "USER: ${USER_NAME}"
#
## Reduce log spam
#export TQDM_MININTERVAL=20
#
## Translate the host-resolved MODEL_PATH to the Docker mount namespace
#DOCKER_MODEL_PATH="${MODEL_PATH/#$MODEL_DIR//models}"
#
#export DI_REPO_DIR=$DI_REPO_DIR
#export WS_PATH=$WS_PATH
#export NNODES=$NNODES
#export NODE0_ADDR=$NODE0_ADDR
#export MODEL_PATH=$MODEL_PATH
#export MODEL_DIR=$MODEL_DIR
#export xP=$xP
#export yD=$yD
#export MODEL_NAME=$MODEL_NAME
#export USER_NAME=$USER_NAME
#export IPADDRS="$(echo "${IPS[*]}" | sed 's/ /,/g')"
#export GPUS_PER_NODE=$GPUS_PER_NODE
#export BENCH_INPUT_LEN=$BENCH_INPUT_LEN
#export BENCH_OUTPUT_LEN=$BENCH_OUTPUT_LEN
#export BENCH_RANDOM_RANGE_RATIO=$BENCH_RANDOM_RANGE_RATIO
#export BENCH_NUM_PROMPTS_MULTIPLIER=$BENCH_NUM_PROMPTS_MULTIPLIER
#export BENCH_MAX_CONCURRENCY=$BENCH_MAX_CONCURRENCY
#export BENCH_REQUEST_RATE=$BENCH_REQUEST_RATE
#export DRY_RUN="${DRY_RUN:-0}"
#export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}"
#export KEEP_CONTAINERS="${KEEP_CONTAINERS:-0}"
#export ENGINE=$ENGINE
#
## Eval-related env vars (threaded from submit.sh)
#export RUN_EVAL="${RUN_EVAL:-false}"
#export EVAL_ONLY="${EVAL_ONLY:-false}"
#export EVAL_CONC="${EVAL_CONC:-}"
#export FRAMEWORK="${FRAMEWORK:-}"
#export PRECISION="${PRECISION:-}"
#export MODEL_PREFIX="${MODEL_PREFIX:-}"
#export RUNNER_TYPE="${RUNNER_TYPE:-}"
#export RESULT_FILENAME="${RESULT_FILENAME:-}"
#export SPEC_DECODING="${SPEC_DECODING:-}"
#export IS_MULTINODE="${IS_MULTINODE:-false}"
#
#SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_')
#export DOCKER_CONT_NAME="container_${ENGINE}_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}"
#
## vLLM external router container
#VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260511-e667ebb}"
#ROUTER_CONT_NAME="router_vllm_${SANITIZED_USER}_${SLURM_JOB_ID}"
#export RUN_FILE_FULL="$WS_PATH/${RUN_FILE}"
#
#SELECTED_NODELIST_SRUN=$(echo "$SELECTED_NODES" | paste -sd,)
#
#cleanup() {
#    echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning up..."
#    rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true
#    echo "[${SLURM_JOB_ID}] cleanup done."
#}
#
#trap cleanup INT TERM HUP
#
## Force NFS cache refresh on all nodes
#echo "Refreshing NFS caches on all nodes..."
#srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c '
#    sync
#    ls -la '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils > /dev/null 2>&1
#    stat '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils/server.sh > /dev/null 2>&1
#    cat '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils/server.sh > /dev/null 2>&1
#    echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null 2>&1 || true
#    echo "NFS cache refreshed on $(hostname)"
#'
#
## =============================================================================
## Build engine-specific Docker environment variables
## =============================================================================
#
## Common env vars (always passed)
#DOCKER_ENV_COMMON=(
#    -e SLURM_JOB_ID=\$SLURM_JOB_ID
#    -e SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST
#    -e NNODES=\$NNODES
#    -e NODE_RANK=\$SLURM_PROCID
#    -e NODE0_ADDR=\$NODE0_ADDR
#    -e MODEL_DIR=/models
#    -e MODEL_NAME=\$MODEL_NAME
#    -e GPUS_PER_NODE=\$GPUS_PER_NODE
#    -e xP=\$xP
#    -e yD=\$yD
#    -e IPADDRS=\$IPADDRS
#    -e BENCH_INPUT_LEN=\$BENCH_INPUT_LEN
#    -e BENCH_OUTPUT_LEN=\$BENCH_OUTPUT_LEN
#    -e BENCH_RANDOM_RANGE_RATIO=\$BENCH_RANDOM_RANGE_RATIO
#    -e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER
#    -e BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY
#    -e TQDM_MININTERVAL=\$TQDM_MININTERVAL
#    -e DRY_RUN=\$DRY_RUN
#    -e BENCHMARK_LOGS_DIR=/benchmark_logs
#    -e ENGINE=\$ENGINE
#    -e WS_PATH=${WS_PATH}
#    -e RUN_EVAL=\$RUN_EVAL
#    -e EVAL_ONLY=\$EVAL_ONLY
#    -e EVAL_CONC=\$EVAL_CONC
#    -e FRAMEWORK=\$FRAMEWORK
#    -e PRECISION=\$PRECISION
#    -e MODEL_PREFIX=\$MODEL_PREFIX
#    -e RUNNER_TYPE=\$RUNNER_TYPE
#    -e RESULT_FILENAME=\$RESULT_FILENAME
#    -e SPEC_DECODING=\$SPEC_DECODING
#    -e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE
#    -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP
#    -e PREFILL_ENABLE_DP=\$PREFILL_ENABLE_DP
#    -e DECODE_TP_SIZE=\$DECODE_TP_SIZE
#    -e DECODE_ENABLE_EP=\$DECODE_ENABLE_EP
#    -e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP
#    -e DECODE_MTP_SIZE=\$DECODE_MTP_SIZE
#    -e IS_MULTINODE=\$IS_MULTINODE
#)
#
## Engine-specific env vars
#if [[ "$ENGINE" == "vllm-disagg" ]]; then
#    DOCKER_ENV_ENGINE=(
#        -e VLLM_WS_PATH=${WS_PATH}
#        -e MODEL_PATH=$DOCKER_MODEL_PATH
#        -e UCX_TLS=tcp,self,shm,rocm_ipc,rocm_copy,cma
#        -e UCX_SOCKADDR_TLS_PRIORITY=tcp
#        -e UCX_MEMTYPE_CACHE=y
#        -e UCX_RNDV_SCHEME=get_zcopy
#        -e UCX_RNDV_THRESH=4k
#        -e UCX_ROCM_IPC_MIN_ZCOPY=0
#        -e UCX_LOG_LEVEL=warn
#        -e HSA_ENABLE_SDMA=1
#        -e PROXY_STREAM_IDLE_TIMEOUT=\${PROXY_STREAM_IDLE_TIMEOUT:-300}
#        -e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-1}
#        -e PYTHONPYCACHEPREFIX=/tmp/pycache
#    )
#else
#    DOCKER_ENV_ENGINE=(
#        -e SGLANG_WS_PATH=${WS_PATH}
#    )
#fi
#
## Engine-specific container filter for pre-clean
#CONT_FILTER="name=^container_${ENGINE}_"
#
#srun \
#    --nodelist="$SELECTED_NODELIST_SRUN" \
#    --kill-on-bad-exit=1 \
#    --signal=TERM@30 \
#    --unbuffered \
#    bash -lc "
#set -euo pipefail
#
#echo \"Rank \$SLURM_PROCID on \$(hostname)\"
#
## Per-node docker privilege detection
#eval \"\$DOCKER_CMD_DETECT\"
#echo \"[docker-detect] rank \$SLURM_PROCID: DOCKER_CMD=\$DOCKER_CMD\"
#
## Pre-clean (idempotent)
#\$DOCKER_CMD ps -aq --filter \"$CONT_FILTER\" | xargs -r \$DOCKER_CMD rm -f || true
#\$DOCKER_CMD ps -aq | xargs -r \$DOCKER_CMD stop || true
#
## Start vLLM external router container on node 0
#if [[ \"$ENGINE\" == \"vllm-disagg\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \"\$SLURM_PROCID\" == \"0\" ]]; then
#    \$DOCKER_CMD rm -f \"$ROUTER_CONT_NAME\" 2>/dev/null || true
#    \$DOCKER_CMD run -d \
#        --name \"$ROUTER_CONT_NAME\" \
#        --network host \
#        -v /tmp:/run_logs \
#        \"$VLLM_ROUTER_IMAGE\" \
#        bash -lc \"mkdir -p /run_logs/slurm_job-${SLURM_JOB_ID} && exec vllm-router \
#            --vllm-pd-disaggregation \
#            --kv-connector moriio \
#            --vllm-discovery-address 0.0.0.0:${PROXY_PING_PORT} \
#            --port ${ROUTER_PORT} \
#            --host 0.0.0.0 \
#            --policy consistent_hash \
#            --prefill-policy consistent_hash \
#            --decode-policy consistent_hash \
#            --log-level info 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/vllm_router_\$(hostname).log \"
#fi
#
## Skip exec on vllm-disagg rank 0 so we can stop the router after the main
## container exits. Without this, decode nodes block forever waiting for the
## router port to close (the router is a separate container).
#MAYBE_EXEC=exec
#if [[ \"$ENGINE\" == \"vllm-disagg\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \"\$SLURM_PROCID\" == \"0\" ]]; then
#    MAYBE_EXEC=
#    set +e
#fi
#
#\$MAYBE_EXEC \$DOCKER_CMD run \
#    --init \
#    --stop-timeout 10 \
#    --device /dev/dri \
#    --device /dev/kfd \
#    --device /dev/infiniband \
#    --device=/dev/infiniband/rdma_cm \
#    --device=/dev/infiniband/uverbs0 \
#    --device=/dev/infiniband/uverbs1 \
#    --device=/dev/infiniband/uverbs2 \
#    --device=/dev/infiniband/uverbs3 \
#    --device=/dev/infiniband/uverbs4 \
#    --device=/dev/infiniband/uverbs5 \
#    --device=/dev/infiniband/uverbs6 \
#    --device=/dev/infiniband/uverbs7 \
#    --ulimit memlock=-1 \
#    --ulimit stack=67108864 \
#    --network host \
#    --ipc host \
#    --group-add video \
#    --cap-add SYS_PTRACE \
#    --security-opt seccomp=unconfined \
#    --privileged \
#    -v /sys:/sys \
#    $(command -v nicctl >/dev/null 2>&1 && echo "-v $(which nicctl):/usr/sbin/nicctl") \
#    -v ${MODEL_DIR}:/models \
#    -v \$HOME/.ssh:/root/.ssh \
#    --shm-size 128G \
#    -v /tmp:/run_logs \
#    -v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \
#    -v ${DI_REPO_DIR}:${DOCKER_MOUNT_PATH} \
#    ${EXTRA_DOCKER_MOUNTS:-} \
#    ${DOCKER_ENV_COMMON[*]} \
#    ${DOCKER_ENV_ENGINE[*]} \
#    --name \"$DOCKER_CONT_NAME\" \
#    --entrypoint \"\" \
#    \"$DOCKER_IMAGE_NAME\" bash -lc '
#        set -o pipefail
#        mkdir -p /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"'
#        '"$RUN_FILE_FULL"' 2>&1 | tee /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"'/server_\$(hostname).log
#    '
#
## Only reached when exec was skipped (vllm-disagg rank 0)
#DOCKER_EXIT_CODE=\$?
#echo \"[rank 0] Main container exited (rc=\$DOCKER_EXIT_CODE). Stopping vllm-router...\"
#\$DOCKER_CMD rm -f \"$ROUTER_CONT_NAME\" 2>/dev/null || true
#exit \$DOCKER_EXIT_CODE
#"
#
#if [[ "${KEEP_CONTAINERS}" != "1" ]]; then
#    srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c 'eval "$DOCKER_CMD_DETECT"; $DOCKER_CMD rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true'
#
#    # Clean up vLLM external router container on node 0
#    if [[ "$ENGINE" == "vllm-disagg" && "$ROUTER_TYPE" == "vllm-router" ]]; then
#        srun --nodes=1 --ntasks=1 --nodelist="$MASTER_NODE" bash -c '
#            eval "$DOCKER_CMD_DETECT"; $DOCKER_CMD rm -f '"$ROUTER_CONT_NAME"' 2>/dev/null || true
#        '
#    fi
#fi
#