SemiAnalysisAI · JordanNanos · Mar 31, 2026 · Mar 31, 2026 · Mar 31, 2026 · Mar 31, 2026
diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml
@@ -71,10 +71,19 @@ mi300x:
 - 'mi300x-amds_2'
 - 'mi300x-amds_3'
 mi325x:
-- 'mi325x-amd_0'
-- 'mi325x-amd_1'
-- 'mi325x-amd_2'
-- 'mi325x-amd_3'
+- 'mi325x-amds_00'
+- 'mi325x-amds_01'
+- 'mi325x-amds_02'
+- 'mi325x-amds_03'
+- 'mi325x-amds_04'
+- 'mi325x-amds_05'
+- 'mi325x-amds_06'
+- 'mi325x-amds_08'
+mi325x-disagg:
+- 'mi325x-amds_00'
+- 'mi325x-amds_01'
+- 'mi325x-amds_02'
+- 'mi325x-amds_03'
 mi355x:
 - 'mi355x-amds_0'
 - 'mi355x-amds_1'

diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh
@@ -57,6 +57,7 @@ for max_concurrency in ${chosen_concurrencies[@]}; do
         --max-concurrency "$max_concurrency" \
         --result-filename "$export_file" \
         --result-dir /workspace/ \
+        --trust-remote-code \
         $( [ "$IS_MTP" = "true" ] && echo "--use-chat-template" )
 
     echo "-----------------------------------------"

diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
@@ -20,6 +20,9 @@ if [[ -z "$IBDEVICES" ]]; then
         export IBDEVICES=ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_5,ionic_6,ionic_7
     elif [[ $NODENAME == mia1* ]]; then
         export IBDEVICES=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
+    elif [[ $NODENAME == chi-mi325x* ]]; then
+        # Vultr/CPE MI325X cluster: Broadcom RoCE (bnxt_re); bnxt_re6 is DOWN, skip it
+        export IBDEVICES=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re7,bnxt_re8
     else
         echo "ERROR: Unable to detect cluster from hostname $NODENAME and IBDEVICES not set" >&2
         exit 1
@@ -42,6 +45,13 @@ export SGLANG_USE_AITER=1
 export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=1200
 export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=1200
 
+# GLM-5: uses NSA (not MLA), needs fused-decode-MLA disabled + fast loading
+if [[ "$MODEL_NAME" == *GLM-5* ]]; then
+    export SGLANG_ROCM_FUSED_DECODE_MLA=0
+    export ROCM_QUICK_REDUCE_QUANTIZATION=INT4
+    export SAFETENSORS_FAST_GPU=1
+fi
+
 # Disable allocating memory in one pass
 export MORI_SHMEM_MODE=ISOLATION
 export SGLANG_MORI_FP8_DISP=True
@@ -64,8 +74,11 @@ export MORI_MAX_DISPATCH_TOKENS_DECODE=160
 export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2))
 
 export MORI_EP_LAUNCH_CONFIG_MODE=AUTO
-export MORI_IO_QP_MAX_SEND_WR=16384
-export MORI_IO_QP_MAX_CQE=32768
+# Broadcom bnxt_re NICs cap SQ depth at ~4351 entries. Lowered from the upstream
+# defaults (16384/32768) to avoid SQ overflow under EP8 RDMA traffic. NOTE: set
+# unconditionally, so it applies on every cluster. See sgl-project/sglang#22072
+export MORI_IO_QP_MAX_SEND_WR=4096
+export MORI_IO_QP_MAX_CQE=8192
 export MORI_IO_QP_MAX_SGE=4
 
 export MORI_APP_LOG_LEVEL=INFO
@@ -101,6 +114,11 @@ $1 == "DSCP" && $2 == ":" && $NF == p {
         elif [[ $NODENAME == mia1* ]]; then
             export MORI_RDMA_TC=104
             echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
+        elif [[ $NODENAME == chi-mi325x* ]]; then
+            # Vultr/CPE MI325X: Broadcom Thor 2, DSCP AF31(26)->prio 3, TC=4*26=104
+            export MORI_RDMA_TC=104
+            export MORI_RDMA_SL=3
+            echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL from hostname $NODENAME"
         else
             echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration."
         fi
@@ -114,6 +132,11 @@ else
     elif [[ $NODENAME == mia1* ]]; then
         export MORI_RDMA_TC=104
         echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
+    elif [[ $NODENAME == chi-mi325x* ]]; then
+        # Vultr/CPE MI325X: Broadcom Thor 2, DSCP AF31(26)->prio 3, TC=4*26=104
+        export MORI_RDMA_TC=104
+        export MORI_RDMA_SL=3
+        echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL from hostname $NODENAME"
     else
         echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration."
         echo "       This is normal for clusters without QoS or outside Docker containers."

diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm
@@ -30,14 +30,18 @@ if [[ ! -f "$MODELS_YAML" ]]; then
     exit 1
 fi
 
-# Validate MODEL_NAME exists as a top-level key in models.yaml
-if ! grep -q "^${MODEL_NAME}:" "$MODELS_YAML"; then
-    echo "Error: Model '$MODEL_NAME' not found in models.yaml"
+# MODEL_YAML_KEY is the models.yaml lookup key (bare model name, e.g. DeepSeek-R1-0528).
+# MODEL_NAME may be a longer HF cache path (e.g. models--org--repo/snapshots/<hash>).
+_MODEL_YAML_KEY="${MODEL_YAML_KEY:-$MODEL_NAME}"
+
+# Validate the yaml key exists as a top-level key in models.yaml
+if ! grep -q "^${_MODEL_YAML_KEY}:" "$MODELS_YAML"; then
+    echo "Error: Model '$_MODEL_YAML_KEY' not found in models.yaml"
     echo "Available models:"
     grep -E '^[A-Za-z]' "$MODELS_YAML" | sed 's/:.*$//' | sed 's/^/  - /'
     exit 1
 fi
-echo "Model found: $MODEL_NAME"
+echo "Model found: $_MODEL_YAML_KEY"
 
 # All models use server.sh as the entrypoint
 RUN_FILE="server.sh"
@@ -249,10 +253,9 @@ echo "NNODES is ${NNODES}"
 echo "REPO Directory is ${DI_REPO_DIR}"
 echo "USER_NAME is ${USER_NAME}"
 
-# Get the RDMA priority and DSCP value from the NIC
+# Get the RDMA priority and DSCP value from the NIC (optional - env.sh handles absence gracefully)
 if ! command -v nicctl >/dev/null 2>&1; then
-    echo "Error: nicctl command not found. Please ensure nicctl is installed and available." >&2
-    exit 1
+    echo "[INFO] nicctl not found. RDMA QoS configuration will be skipped inside the container." >&2
 fi
 
 # Reduce log spam
@@ -286,7 +289,8 @@ export DRY_RUN="${DRY_RUN:-0}"
 export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}"
 
 SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_')
-export DOCKER_CONT_NAME="container_sbatch_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}"
+SANITIZED_MODEL=$(printf '%s' "$MODEL_NAME" | tr -c 'a-zA-Z0-9_.-' '_')  # printf, not echo: echo's trailing newline would be turned into a stray '_' by tr -c
+export DOCKER_CONT_NAME="container_sbatch_${SANITIZED_USER}_${SANITIZED_MODEL}_${SLURM_JOB_ID}"
 export RUN_FILE_FULL="$SGLANG_WS_PATH/${RUN_FILE}"
 
 
@@ -296,8 +300,8 @@ SELECTED_NODELIST_SRUN=$(echo "$SELECTED_NODES" | paste -sd,)
 
 cleanup() {
   echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning stale logs folder..."
-  # clean up the logs folder
-  sudo rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true
+  # NFS-safe cleanup: timeout avoids hanging on stale NFS locks; skipped when SLURM_SUBMIT_DIR is empty so /logs is never removed
+  [[ -n "${SLURM_SUBMIT_DIR:-}" ]] && timeout --kill-after=5 30 sudo rm -rf -- "${SLURM_SUBMIT_DIR}/logs" 2>/dev/null || true
 
   echo "[${SLURM_JOB_ID}] cleanup done."
 }
@@ -318,6 +322,54 @@ srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c '
     echo "NFS cache refreshed on $(hostname)"
 '
 
+# =============================================================================
+# Optional: Pre-stage model to local NVMe for faster loading
+# =============================================================================
+# LOCAL_MODEL_CACHE_DIR: mount point for fast local storage (NVMe/SSD) on compute nodes.
+# Set per-cluster via the runner/launch script. When set, model weights are copied
+# with `rclone sync` from shared storage to local NVMe before Docker starts. This is
+# idempotent — subsequent runs skip files already cached locally.
+#
+# If unset, or if staging fails on any node, the model is served directly from
+# shared storage (NFS/Lustre) as before. A missing cache directory is created.
+if [[ -n "${LOCAL_MODEL_CACHE_DIR:-}" ]]; then
+    LOCAL_MODEL_FULL="${LOCAL_MODEL_CACHE_DIR}/${MODEL_NAME}"
+    echo "[cache] Pre-staging model to local NVMe on all nodes..."
+    echo "[cache]   Source: $MODEL_PATH"
+    echo "[cache]   Dest:   $LOCAL_MODEL_FULL"
+    stage_rc=0  # set via `|| stage_rc=$?` below so the fallback path is reached even if the script runs under `set -e`
+    srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c '
+        set -euo pipefail
+        SRC="'"$MODEL_PATH"'"
+        DST="'"$LOCAL_MODEL_FULL"'"
+        CACHE_DIR="'"${LOCAL_MODEL_CACHE_DIR}"'"
+
+        # Create the cache root (sudo first, plain mkdir as fallback) and take ownership
+        sudo mkdir -p "$CACHE_DIR" 2>/dev/null || mkdir -p "$CACHE_DIR"
+        sudo chown -R "$(whoami)" "$CACHE_DIR" 2>/dev/null || true
+
+        echo "[cache] $(hostname): Syncing model to local NVMe..."
+        START=$(date +%s)
+
+        rclone sync "$SRC/" "$DST/" \
+            --transfers 32 \
+            --checkers 32 \
+            --links \
+            --progress
+
+        ELAPSED=$(( $(date +%s) - START ))
+        SIZE=$(du -sh "$DST" 2>/dev/null | cut -f1) || true  # size is informational only; never fail staging on it
+        echo "[cache] $(hostname): Done in ${ELAPSED}s ($SIZE)"
+    ' 2>&1 || stage_rc=$?
+
+    if [[ $stage_rc -eq 0 ]]; then
+        echo "[cache] Model pre-staged successfully. Updating MODEL_DIR."
+        MODEL_DIR="${LOCAL_MODEL_CACHE_DIR}"  # every node staged OK: serve from the local cache root instead of shared storage
+    else
+        echo "[cache] WARNING: Local caching failed on some nodes. Falling back to shared storage."
+    fi
+fi
+
 srun \
   --nodelist="$SELECTED_NODELIST_SRUN" \
   --kill-on-bad-exit=1 \
@@ -357,7 +409,7 @@ exec sudo docker run --rm \
     --privileged \
     -v ${MODEL_DIR}:/models \
     -v \$HOME/.ssh:/root/.ssh \
-    -v $(which nicctl):/usr/sbin/nicctl \
+    $(command -v nicctl &>/dev/null && echo "-v $(which nicctl):/usr/sbin/nicctl") \
     --shm-size 128G \
     -v /tmp:/run_logs \
     -v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \
@@ -373,6 +425,7 @@ exec sudo docker run --rm \
     -e xP=\$xP \
     -e yD=\$yD \
     -e MODEL_NAME=\$MODEL_NAME \
+    -e MODEL_YAML_KEY=${_MODEL_YAML_KEY} \
     -e IPADDRS=\$IPADDRS \
     -e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE \
     -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP \