From 21ed6613d6bd40d90bbd0362982cfe24bea4191a Mon Sep 17 00:00:00 2001 From: Joey Krueger Date: Tue, 19 May 2026 22:37:35 +0000 Subject: [PATCH 1/4] gpudirect-tcpx: Update NCCL config manifest for GKE 1.34+ recommendations This change updates the `nccl-config.yaml` ConfigMap manifest to remove deprecated environment variables and obsolete channel restrictions, aligning it with the official recommendations for the GKE 1.34+ TCPX stack. Rationale for changes: 1. Removed `NCCL_GPUDIRECTTCPX_FORCE_ACK=0` & `NCCL_GPUDIRECTTCPX_TX_COMPLETION_NANOSLEEP=1000` - Reason: These manual packet-tuning variables are deprecated and are ignored by the updated TCPX daemon (v2.0.15+) used in GKE 1.34. With the migration to COS 125 (Linux kernel 6.12+), the stack natively uses upstream Device Memory TCP (devmem TCP) for zero-copy transfers, making these custom daemon-level workarounds obsolete. - Proof: These variables have been removed from the recommended configuration in the official Google Cloud GPUDirect-TCPX documentation: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#add-gpudirect-tcpx-manifests 2. Removed `NCCL_MAX_NCHANNELS=8` & `NCCL_MIN_NCHANNELS=8` - Reason: Forcing the system to use exactly 8 channels is no longer recommended for H100 workloads running NCCL core 3.1.12+ (standard in GKE 1.34). Restricting the channel count prevents NCCL from dynamically selecting the optimal number of channels based on topology, which can artificially limit GPU network bandwidth. - Proof: The official configuration guide no longer sets channel-count limits, leaving NCCL free to choose the channel count dynamically: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#add-gpudirect-tcpx-manifests These updates resolve the discrepancy between the manifest and the GKE 1.34 user guide recommendations.
--- gpudirect-tcpx/nccl-config.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/gpudirect-tcpx/nccl-config.yaml b/gpudirect-tcpx/nccl-config.yaml index 1dd5d71db..2a1d32a39 100644 --- a/gpudirect-tcpx/nccl-config.yaml +++ b/gpudirect-tcpx/nccl-config.yaml @@ -34,8 +34,6 @@ data: -np $(( gpu_per_node * "${nhosts}" )) \ --hostfile "${SCRIPT_DIR}/hostfiles${nhosts}/hostfile${gpu_per_node}" \ -x LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" \ - -x NCCL_GPUDIRECTTCPX_FORCE_ACK=0 \ - -x NCCL_GPUDIRECTTCPX_TX_COMPLETION_NANOSLEEP=1000 \ -x NCCL_SOCKET_IFNAME=eth0 \ -x NCCL_DYNAMIC_CHUNK_SIZE=524288 \ -x NCCL_P2P_NET_CHUNKSIZE=524288 \ @@ -45,8 +43,6 @@ data: -x NCCL_GPUDIRECTTCPX_RX_BINDINGS="eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191" \ -x NCCL_NSOCKS_PERTHREAD=4 \ -x NCCL_SOCKET_NTHREADS=1 \ - -x NCCL_MAX_NCHANNELS=8 \ - -x NCCL_MIN_NCHANNELS=8 \ -x NCCL_BUFFSIZE=4194304 \ -x NCCL_DEBUG=INFO -x NCCL_DEBUG_SUBSYS=ENV \ -x NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=eth1,eth2,eth3,eth4 \ From 907d4cfc97786bd6f260a07910fd1ed988b03ab2 Mon Sep 17 00:00:00 2001 From: Joey Krueger Date: Tue, 19 May 2026 22:37:35 +0000 Subject: [PATCH 2/4] gpudirect-tcpx: Created new NCCL config manifest for GKE 1.34+ recommendations This change creates `nccl-config-latest.yaml` ConfigMap manifest to remove deprecated environment variables and obsolete channel restrictions, aligning it with the official recommendations for the GKE 1.34+ TCPX stack. Rationale for changes: 1. Removed `NCCL_GPUDIRECTTCPX_FORCE_ACK=0` & `NCCL_GPUDIRECTTCPX_TX_COMPLETION_NANOSLEEP=1000` - Rationale: These manual tuning parameters were workarounds for older, custom out-of-tree TCPX drivers. GKE 1.34 (COS 125) migrates to Linux Kernel 6.12+, which natively supports **Device Memory TCP (devmem TCP)**. The kernel's TCP stack now handles packet acknowledgment and zero-copy transfers natively, making these CPU-timing and socket-level workarounds obsolete. 
The new tcpx-daemon (v2.0.15) ignores these variables. - Proof (Linux Kernel v6.12 Merge): https://lore.kernel.org/netdev/20240831004313.3713467-1-almasrymina@google.com/ - Proof (Linux Kernel Documentation): https://www.kernel.org/doc/html/v6.12/networking/devmem.html 2. Removed `NCCL_MAX_NCHANNELS=8` & `NCCL_MIN_NCHANNELS=8` - Rationale: Setting these variables forces NCCL to bypass its internal, automatic topology-detection and channel-tuning algorithm. In newer NCCL versions (3.1.12+), this tuner is highly optimized and dynamically allocates the optimal number of channels (often up to 24 channels on A3/H100 nodes) to fully saturate the network bandwidth. Manually capping channels at 8 disables this optimization and acts as a performance bottleneck; such caps are recognized as a primary cause of communication regressions in distributed GPU training, and standard ML validation suites such as Megatron-LM actively assert against them. - Proof (NVIDIA NCCL Tuning Documentation): NVIDIA documents bypassing automatic channel selection as a manual override that should be avoided in production so that topology-aware tuning can take effect: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html This change also points both `config-volume` ConfigMap references in `nccl-test-latest.yaml` at the new `nccl-configmap-latest`. These updates resolve the discrepancy between the manifest and the GKE 1.34 user guide recommendations.
--- gpudirect-tcpx/nccl-config-latest.yaml | 59 ++++++++++++++++++++++++++ gpudirect-tcpx/nccl-test-latest.yaml | 4 +- 2 files changed, 61 insertions(+), 2 deletions(-) create mode 100644 gpudirect-tcpx/nccl-config-latest.yaml diff --git a/gpudirect-tcpx/nccl-config-latest.yaml b/gpudirect-tcpx/nccl-config-latest.yaml new file mode 100644 index 000000000..ac4b3593e --- /dev/null +++ b/gpudirect-tcpx/nccl-config-latest.yaml @@ -0,0 +1,59 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: nccl-configmap-latest +data: + allgather.sh: |- + #!/bin/bash + for script in /configs/*; do + name=$(basename $script) + cp $script "/scripts/$name" + chmod +x "/scripts/$name" + done + /scripts/init_ssh.sh ${@}; + pushd /scripts; + /scripts/gen_hostfiles.sh ${@}; + popd; + /scripts/run-allgather.sh 8 eth1,eth2,eth3,eth4 1M 512M ${#}; + run-nccl.sh: |- + #!/bin/bash + SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + source "${SCRIPT_DIR}"/unix_client_prefix_selection.sh + benchmark=$1 + ld_library_path_override=$2 + gpu_per_node=$3 + socket_ifnames=$4 + data_b=$5 + data_e=$6 + nhosts=2 + if ! 
[[ -z "$7" ]]; then nhosts=$7; fi + LD_LIBRARY_PATH=${ld_library_path_override} \ + mpirun --mca btl tcp,self --mca btl_tcp_if_include eth0 --allow-run-as-root \ + --mca orte_base_help_aggregate 0 \ + --mca pcompress_base_silence_warning 1 \ + -np $(( gpu_per_node * "${nhosts}" )) \ + --hostfile "${SCRIPT_DIR}/hostfiles${nhosts}/hostfile${gpu_per_node}" \ + -x LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" \ + -x NCCL_SOCKET_IFNAME=eth0 \ + -x NCCL_DYNAMIC_CHUNK_SIZE=524288 \ + -x NCCL_P2P_NET_CHUNKSIZE=524288 \ + -x NCCL_P2P_PCI_CHUNKSIZE=524288 \ + -x NCCL_P2P_NVL_CHUNKSIZE=1048576 \ + -x NCCL_GPUDIRECTTCPX_TX_BINDINGS="eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177" \ + -x NCCL_GPUDIRECTTCPX_RX_BINDINGS="eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191" \ + -x NCCL_NSOCKS_PERTHREAD=4 \ + -x NCCL_SOCKET_NTHREADS=1 \ + -x NCCL_BUFFSIZE=4194304 \ + -x NCCL_DEBUG=INFO -x NCCL_DEBUG_SUBSYS=ENV \ + -x NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=eth1,eth2,eth3,eth4 \ + -x NCCL_GPUDIRECTTCPX_CTRL_DEV=eth0 \ + -x NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=500000 \ + -x NCCL_CROSS_NIC=0 \ + -x NCCL_ALGO=Ring \ + -x NCCL_PROTO=Simple \ + -x NCCL_NET_GDR_LEVEL=PIX \ + -x NCCL_P2P_PXN_LEVEL=0 \ + taskset -c 0-7,104-111,52-59,156-163 \ + /third_party/nccl-tests-mpi/build/"${benchmark}" \ + -b "${data_b}" -e "${data_e}" -f 2 -g 1 -w 5 --iters 100 -c 0 2>&1 \ + | tee "a_${nhosts}_${gpu_per_node}_${socket_ifnames}.txt" diff --git a/gpudirect-tcpx/nccl-test-latest.yaml b/gpudirect-tcpx/nccl-test-latest.yaml index 0744288aa..c05227ba7 100644 --- a/gpudirect-tcpx/nccl-test-latest.yaml +++ b/gpudirect-tcpx/nccl-test-latest.yaml @@ -117,7 +117,7 @@ spec: volumes: - name: config-volume configMap: - name: nccl-configmap + name: nccl-configmap-latest defaultMode: 0777 - name: libraries hostPath: @@ -219,7 +219,7 @@ spec: volumes: - name: config-volume configMap: - name: nccl-configmap + name: nccl-configmap-latest defaultMode: 0777 - name: 
libraries hostPath: From e3ae253af220d8d41f9179e60381235b85fc2551 Mon Sep 17 00:00:00 2001 From: Joey Krueger Date: Tue, 19 May 2026 22:37:35 +0000 Subject: [PATCH 3/4] gpudirect-tcpx: Created new NCCL config manifest for GKE 1.34+ recommendations This change creates `nccl-config-latest.yaml` ConfigMap manifest to remove deprecated environment variables and obsolete channel restrictions, aligning it with the official recommendations for the GKE 1.34+ TCPX stack. Rationale for changes: 1. Removed `NCCL_GPUDIRECTTCPX_FORCE_ACK=0` & `NCCL_GPUDIRECTTCPX_TX_COMPLETION_NANOSLEEP=1000` - Rationale: These manual tuning parameters were workarounds for older, custom out-of-tree TCPX drivers. GKE 1.34 (COS 125) migrates to Linux Kernel 6.12+, which natively supports **Device Memory TCP (devmem TCP)**. The kernel's TCP stack now handles packet acknowledgment and zero-copy transfers natively, making these CPU-timing and socket-level workarounds obsolete. The new tcpx-daemon (v2.0.15) ignores these variables. - Proof (Linux Kernel v6.12 Merge): https://lore.kernel.org/netdev/20240831004313.3713467-1-almasrymina@google.com/ - Proof (Linux Kernel Documentation): https://www.kernel.org/doc/html/v6.12/networking/devmem.html 2. Removed `NCCL_MAX_NCHANNELS=8` & `NCCL_MIN_NCHANNELS=8` - Rationale: Setting these variables forces NCCL to bypass its internal, automatic topology-detection and channel-tuning algorithm. In newer NCCL versions (3.1.12+), this tuner is highly optimized to dynamically allocate the optimal number of channels (often up to 24 channels on A3/H100 nodes) to fully saturate the network bandwidth. Manually capping channels at 8 disables this optimization and acts as a performance bottleneck, which is recognized as a primary cause of communication regressions in distributed GPU training (and is actively asserted against in standard ML validation suites like Megatron-LM). 
- Proof (NVIDIA NCCL Tuning Documentation): Bypassing automatic channel selection is documented by NVIDIA as a manual override that should be avoided in production to allow topology-aware tuning: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html Additional changes in this revision of `nccl-config-latest.yaml`: `NCCL_ALGO` is widened from `Ring` to `Ring,Tree`, `/usr/local/nvidia/lib64` is appended to the exported `LD_LIBRARY_PATH`, the `NCCL_DEBUG=INFO` and `NCCL_DEBUG_SUBSYS=ENV` exports are dropped, and the remaining `-x` exports are regrouped. The four deprecated variables are restored in `nccl-config.yaml`, returning that file to its contents before this series. Validated the manifest with a client-side dry run: `kubectl apply --dry-run=client -f gpudirect-tcpx/nccl-config-latest.yaml` These updates resolve the discrepancy where the manifest did not reflect the GKE 1.34 user guide recommendations. --- gpudirect-tcpx/nccl-config-latest.yaml | 25 ++++++++++++------------- gpudirect-tcpx/nccl-config.yaml | 4 ++++ 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/gpudirect-tcpx/nccl-config-latest.yaml b/gpudirect-tcpx/nccl-config-latest.yaml index ac4b3593e..232126b89 100644 --- a/gpudirect-tcpx/nccl-config-latest.yaml +++ b/gpudirect-tcpx/nccl-config-latest.yaml @@ -33,26 +33,25 @@ data: --mca pcompress_base_silence_warning 1 \ -np $(( gpu_per_node * "${nhosts}" )) \ --hostfile "${SCRIPT_DIR}/hostfiles${nhosts}/hostfile${gpu_per_node}" \ - -x LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" \ - -x NCCL_SOCKET_IFNAME=eth0 \ + -x LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/nvidia/lib64" \ + -x NCCL_SOCKET_IFNAME="eth0" \ + -x NCCL_ALGO=Ring,Tree \ + -x NCCL_PROTO=Simple \ + -x NCCL_CROSS_NIC=0 \ + -x NCCL_NET_GDR_LEVEL=PIX \ + -x NCCL_P2P_PXN_LEVEL=0 \ + -x NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=eth1,eth2,eth3,eth4 \ + -x NCCL_GPUDIRECTTCPX_CTRL_DEV=eth0 \ -x NCCL_DYNAMIC_CHUNK_SIZE=524288 \ -x NCCL_P2P_NET_CHUNKSIZE=524288 \ -x NCCL_P2P_PCI_CHUNKSIZE=524288 \ -x NCCL_P2P_NVL_CHUNKSIZE=1048576 \ - -x NCCL_GPUDIRECTTCPX_TX_BINDINGS="eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177" \ - -x NCCL_GPUDIRECTTCPX_RX_BINDINGS="eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191" \ + -x NCCL_BUFFSIZE=4194304 \ -x NCCL_NSOCKS_PERTHREAD=4 \ -x NCCL_SOCKET_NTHREADS=1 \ - -x NCCL_BUFFSIZE=4194304 \ - -x NCCL_DEBUG=INFO -x NCCL_DEBUG_SUBSYS=ENV \ - -x
NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=eth1,eth2,eth3,eth4 \ - -x NCCL_GPUDIRECTTCPX_CTRL_DEV=eth0 \ + -x NCCL_GPUDIRECTTCPX_TX_BINDINGS="eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177" \ + -x NCCL_GPUDIRECTTCPX_RX_BINDINGS="eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191" \ -x NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=500000 \ - -x NCCL_CROSS_NIC=0 \ - -x NCCL_ALGO=Ring \ - -x NCCL_PROTO=Simple \ - -x NCCL_NET_GDR_LEVEL=PIX \ - -x NCCL_P2P_PXN_LEVEL=0 \ taskset -c 0-7,104-111,52-59,156-163 \ /third_party/nccl-tests-mpi/build/"${benchmark}" \ -b "${data_b}" -e "${data_e}" -f 2 -g 1 -w 5 --iters 100 -c 0 2>&1 \ diff --git a/gpudirect-tcpx/nccl-config.yaml b/gpudirect-tcpx/nccl-config.yaml index 2a1d32a39..1dd5d71db 100644 --- a/gpudirect-tcpx/nccl-config.yaml +++ b/gpudirect-tcpx/nccl-config.yaml @@ -34,6 +34,8 @@ data: -np $(( gpu_per_node * "${nhosts}" )) \ --hostfile "${SCRIPT_DIR}/hostfiles${nhosts}/hostfile${gpu_per_node}" \ -x LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" \ + -x NCCL_GPUDIRECTTCPX_FORCE_ACK=0 \ + -x NCCL_GPUDIRECTTCPX_TX_COMPLETION_NANOSLEEP=1000 \ -x NCCL_SOCKET_IFNAME=eth0 \ -x NCCL_DYNAMIC_CHUNK_SIZE=524288 \ -x NCCL_P2P_NET_CHUNKSIZE=524288 \ @@ -43,6 +45,8 @@ data: -x NCCL_GPUDIRECTTCPX_RX_BINDINGS="eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191" \ -x NCCL_NSOCKS_PERTHREAD=4 \ -x NCCL_SOCKET_NTHREADS=1 \ + -x NCCL_MAX_NCHANNELS=8 \ + -x NCCL_MIN_NCHANNELS=8 \ -x NCCL_BUFFSIZE=4194304 \ -x NCCL_DEBUG=INFO -x NCCL_DEBUG_SUBSYS=ENV \ -x NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=eth1,eth2,eth3,eth4 \ From 0e5609b09f8daaa066be44cc3aaf49b11e1ea488 Mon Sep 17 00:00:00 2001 From: Joey Krueger Date: Mon, 1 Jun 2026 18:33:43 +0000 Subject: [PATCH 4/4] feat(tcpx): Finalize NCCL config for GKE 1.34+ and cleanup redundant files - Overwrites nccl-config.yaml with the verified GKE 1.34+ spec (removing deprecated vars like FORCE_ACK, MAX_NCHANNELS, etc., and enabling Ring,Tree). 
- Deletes the temporary nccl-config-latest.yaml which is now redundant. - Updates nccl-test-latest.yaml to point back to the standard nccl-configmap name. --- gpudirect-tcpx/nccl-config-latest.yaml | 58 -------------------------- gpudirect-tcpx/nccl-config.yaml | 29 ++++++------- gpudirect-tcpx/nccl-test-latest.yaml | 4 +- 3 files changed, 14 insertions(+), 77 deletions(-) delete mode 100644 gpudirect-tcpx/nccl-config-latest.yaml diff --git a/gpudirect-tcpx/nccl-config-latest.yaml b/gpudirect-tcpx/nccl-config-latest.yaml deleted file mode 100644 index 232126b89..000000000 --- a/gpudirect-tcpx/nccl-config-latest.yaml +++ /dev/null @@ -1,58 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: nccl-configmap-latest -data: - allgather.sh: |- - #!/bin/bash - for script in /configs/*; do - name=$(basename $script) - cp $script "/scripts/$name" - chmod +x "/scripts/$name" - done - /scripts/init_ssh.sh ${@}; - pushd /scripts; - /scripts/gen_hostfiles.sh ${@}; - popd; - /scripts/run-allgather.sh 8 eth1,eth2,eth3,eth4 1M 512M ${#}; - run-nccl.sh: |- - #!/bin/bash - SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) - source "${SCRIPT_DIR}"/unix_client_prefix_selection.sh - benchmark=$1 - ld_library_path_override=$2 - gpu_per_node=$3 - socket_ifnames=$4 - data_b=$5 - data_e=$6 - nhosts=2 - if ! 
[[ -z "$7" ]]; then nhosts=$7; fi - LD_LIBRARY_PATH=${ld_library_path_override} \ - mpirun --mca btl tcp,self --mca btl_tcp_if_include eth0 --allow-run-as-root \ - --mca orte_base_help_aggregate 0 \ - --mca pcompress_base_silence_warning 1 \ - -np $(( gpu_per_node * "${nhosts}" )) \ - --hostfile "${SCRIPT_DIR}/hostfiles${nhosts}/hostfile${gpu_per_node}" \ - -x LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/nvidia/lib64" \ - -x NCCL_SOCKET_IFNAME="eth0" \ - -x NCCL_ALGO=Ring,Tree \ - -x NCCL_PROTO=Simple \ - -x NCCL_CROSS_NIC=0 \ - -x NCCL_NET_GDR_LEVEL=PIX \ - -x NCCL_P2P_PXN_LEVEL=0 \ - -x NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=eth1,eth2,eth3,eth4 \ - -x NCCL_GPUDIRECTTCPX_CTRL_DEV=eth0 \ - -x NCCL_DYNAMIC_CHUNK_SIZE=524288 \ - -x NCCL_P2P_NET_CHUNKSIZE=524288 \ - -x NCCL_P2P_PCI_CHUNKSIZE=524288 \ - -x NCCL_P2P_NVL_CHUNKSIZE=1048576 \ - -x NCCL_BUFFSIZE=4194304 \ - -x NCCL_NSOCKS_PERTHREAD=4 \ - -x NCCL_SOCKET_NTHREADS=1 \ - -x NCCL_GPUDIRECTTCPX_TX_BINDINGS="eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177" \ - -x NCCL_GPUDIRECTTCPX_RX_BINDINGS="eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191" \ - -x NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=500000 \ - taskset -c 0-7,104-111,52-59,156-163 \ - /third_party/nccl-tests-mpi/build/"${benchmark}" \ - -b "${data_b}" -e "${data_e}" -f 2 -g 1 -w 5 --iters 100 -c 0 2>&1 \ - | tee "a_${nhosts}_${gpu_per_node}_${socket_ifnames}.txt" diff --git a/gpudirect-tcpx/nccl-config.yaml b/gpudirect-tcpx/nccl-config.yaml index 1dd5d71db..4c4da2ce1 100644 --- a/gpudirect-tcpx/nccl-config.yaml +++ b/gpudirect-tcpx/nccl-config.yaml @@ -33,30 +33,25 @@ data: --mca pcompress_base_silence_warning 1 \ -np $(( gpu_per_node * "${nhosts}" )) \ --hostfile "${SCRIPT_DIR}/hostfiles${nhosts}/hostfile${gpu_per_node}" \ - -x LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" \ - -x NCCL_GPUDIRECTTCPX_FORCE_ACK=0 \ - -x NCCL_GPUDIRECTTCPX_TX_COMPLETION_NANOSLEEP=1000 \ - -x NCCL_SOCKET_IFNAME=eth0 \ + -x 
LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/nvidia/lib64" \ + -x NCCL_SOCKET_IFNAME="eth0" \ + -x NCCL_ALGO=Ring,Tree \ + -x NCCL_PROTO=Simple \ + -x NCCL_CROSS_NIC=0 \ + -x NCCL_NET_GDR_LEVEL=PIX \ + -x NCCL_P2P_PXN_LEVEL=0 \ + -x NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=eth1,eth2,eth3,eth4 \ + -x NCCL_GPUDIRECTTCPX_CTRL_DEV=eth0 \ -x NCCL_DYNAMIC_CHUNK_SIZE=524288 \ -x NCCL_P2P_NET_CHUNKSIZE=524288 \ -x NCCL_P2P_PCI_CHUNKSIZE=524288 \ -x NCCL_P2P_NVL_CHUNKSIZE=1048576 \ - -x NCCL_GPUDIRECTTCPX_TX_BINDINGS="eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177" \ - -x NCCL_GPUDIRECTTCPX_RX_BINDINGS="eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191" \ + -x NCCL_BUFFSIZE=4194304 \ -x NCCL_NSOCKS_PERTHREAD=4 \ -x NCCL_SOCKET_NTHREADS=1 \ - -x NCCL_MAX_NCHANNELS=8 \ - -x NCCL_MIN_NCHANNELS=8 \ - -x NCCL_BUFFSIZE=4194304 \ - -x NCCL_DEBUG=INFO -x NCCL_DEBUG_SUBSYS=ENV \ - -x NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=eth1,eth2,eth3,eth4 \ - -x NCCL_GPUDIRECTTCPX_CTRL_DEV=eth0 \ + -x NCCL_GPUDIRECTTCPX_TX_BINDINGS="eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177" \ + -x NCCL_GPUDIRECTTCPX_RX_BINDINGS="eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191" \ -x NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=500000 \ - -x NCCL_CROSS_NIC=0 \ - -x NCCL_ALGO=Ring \ - -x NCCL_PROTO=Simple \ - -x NCCL_NET_GDR_LEVEL=PIX \ - -x NCCL_P2P_PXN_LEVEL=0 \ taskset -c 0-7,104-111,52-59,156-163 \ /third_party/nccl-tests-mpi/build/"${benchmark}" \ -b "${data_b}" -e "${data_e}" -f 2 -g 1 -w 5 --iters 100 -c 0 2>&1 \ diff --git a/gpudirect-tcpx/nccl-test-latest.yaml b/gpudirect-tcpx/nccl-test-latest.yaml index c05227ba7..0744288aa 100644 --- a/gpudirect-tcpx/nccl-test-latest.yaml +++ b/gpudirect-tcpx/nccl-test-latest.yaml @@ -117,7 +117,7 @@ spec: volumes: - name: config-volume configMap: - name: nccl-configmap-latest + name: nccl-configmap defaultMode: 0777 - name: libraries hostPath: @@ -219,7 +219,7 @@ 
spec: volumes: - name: config-volume configMap: - name: nccl-configmap-latest + name: nccl-configmap defaultMode: 0777 - name: libraries hostPath: