From 21ed6613d6bd40d90bbd0362982cfe24bea4191a Mon Sep 17 00:00:00 2001
From: Joey Krueger <joeykrueger@google.com>
Date: Tue, 19 May 2026 22:37:35 +0000
Subject: [PATCH 1/4] gpudirect-tcpx: Update NCCL config manifest for GKE 1.34+
 recommendations

This change updates the `nccl-config.yaml` ConfigMap manifest to remove deprecated environment variables and obsolete channel restrictions, aligning it with the official recommendations for the GKE 1.34+ TCPX stack.

Rationale for changes:

1. Removed `NCCL_GPUDIRECTTCPX_FORCE_ACK=0` & `NCCL_GPUDIRECTTCPX_TX_COMPLETION_NANOSLEEP=1000`
   - Reason: These manual packet tuning variables are deprecated and completely ignored by the updated TCPX daemon (v2.0.15+) used in GKE 1.34. With the migration to COS 125 (Linux kernel 6.12+), the stack natively utilizes upstream Device Memory TCP (devmem TCP) for zero-copy transfers, making these custom daemon-level workarounds obsolete.
   - Proof: These variables have been removed from the recommended configuration in the official Google Cloud GPUDirect-TCPX documentation:
     https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#add-gpudirect-tcpx-manifests

2. Removed `NCCL_MAX_NCHANNELS=8` & `NCCL_MIN_NCHANNELS=8`
   - Reason: Forcing the system to use exactly 8 channels is no longer recommended for H100 workloads running NCCL core 3.1.12+ (standard in GKE 1.34). Restricting the channel count prevents NCCL from dynamically selecting the optimal number of channels based on topology, which can artificially limit GPU network bandwidth.
   - Proof: The official configuration guide no longer lists channel count limits, allowing NCCL to dynamically optimize itself:
     https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#add-gpudirect-tcpx-manifests

These updates resolve the discrepancy where the manifest did not reflect the GKE 1.34 user guide recommendations.
---
 gpudirect-tcpx/nccl-config.yaml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/gpudirect-tcpx/nccl-config.yaml b/gpudirect-tcpx/nccl-config.yaml
index 1dd5d71db..2a1d32a39 100644
--- a/gpudirect-tcpx/nccl-config.yaml
+++ b/gpudirect-tcpx/nccl-config.yaml
@@ -34,8 +34,6 @@ data:
     -np $(( gpu_per_node * "${nhosts}" )) \
     --hostfile "${SCRIPT_DIR}/hostfiles${nhosts}/hostfile${gpu_per_node}" \
     -x LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" \
-    -x NCCL_GPUDIRECTTCPX_FORCE_ACK=0 \
-    -x NCCL_GPUDIRECTTCPX_TX_COMPLETION_NANOSLEEP=1000 \
     -x NCCL_SOCKET_IFNAME=eth0 \
     -x NCCL_DYNAMIC_CHUNK_SIZE=524288 \
     -x NCCL_P2P_NET_CHUNKSIZE=524288 \
@@ -45,8 +43,6 @@ data:
     -x NCCL_GPUDIRECTTCPX_RX_BINDINGS="eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191" \
     -x NCCL_NSOCKS_PERTHREAD=4 \
     -x NCCL_SOCKET_NTHREADS=1 \
-    -x NCCL_MAX_NCHANNELS=8 \
-    -x NCCL_MIN_NCHANNELS=8 \
     -x NCCL_BUFFSIZE=4194304 \
     -x NCCL_DEBUG=INFO -x NCCL_DEBUG_SUBSYS=ENV \
     -x NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=eth1,eth2,eth3,eth4 \

From 907d4cfc97786bd6f260a07910fd1ed988b03ab2 Mon Sep 17 00:00:00 2001
From: Joey Krueger <joeykrueger@google.com>
Date: Tue, 19 May 2026 22:37:35 +0000
Subject: [PATCH 2/4] gpudirect-tcpx: Create new NCCL config manifest for GKE
 1.34+ recommendations

This change creates the `nccl-config-latest.yaml` ConfigMap manifest (`nccl-configmap-latest`), which omits the deprecated environment variables and obsolete channel restrictions, and points `nccl-test-latest.yaml` at the new ConfigMap, aligning the test with the official recommendations for the GKE 1.34+ TCPX stack.

Rationale for changes:

1. Removed `NCCL_GPUDIRECTTCPX_FORCE_ACK=0` & `NCCL_GPUDIRECTTCPX_TX_COMPLETION_NANOSLEEP=1000`
   - Rationale: These manual tuning parameters were workarounds for older, custom out-of-tree TCPX drivers. GKE 1.34 (COS 125) migrates to Linux Kernel 6.12+, which natively supports **Device Memory TCP (devmem TCP)**. The kernel's TCP stack now handles packet acknowledgment and zero-copy transfers natively, making these CPU-timing and socket-level workarounds obsolete. The new tcpx-daemon (v2.0.15) ignores these variables.
   - Proof (Linux Kernel v6.12 Merge): https://lore.kernel.org/netdev/20240831004313.3713467-1-almasrymina@google.com/
   - Proof (Linux Kernel Documentation): https://www.kernel.org/doc/html/v6.12/networking/devmem.html

2. Removed `NCCL_MAX_NCHANNELS=8` & `NCCL_MIN_NCHANNELS=8`
   - Rationale: Setting these variables forces NCCL to bypass its internal, automatic topology-detection and channel-tuning algorithm. In newer NCCL versions (3.1.12+), this tuner is highly optimized to dynamically allocate the optimal number of channels (often up to 24 channels on A3/H100 nodes) to fully saturate the network bandwidth. Manually capping channels at 8 disables this optimization and acts as a performance bottleneck, which is recognized as a primary cause of communication regressions in distributed GPU training (and is actively asserted against in standard ML validation suites like Megatron-LM).
   - Proof (NVIDIA NCCL Tuning Documentation): Bypassing automatic channel selection is documented by NVIDIA as a manual override that should be avoided in production to allow topology-aware tuning:
     https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html

These updates resolve the discrepancy where the manifest did not reflect the GKE 1.34 user guide recommendations.
---
 gpudirect-tcpx/nccl-config-latest.yaml | 59 ++++++++++++++++++++++++++
 gpudirect-tcpx/nccl-test-latest.yaml   |  4 +-
 2 files changed, 61 insertions(+), 2 deletions(-)
 create mode 100644 gpudirect-tcpx/nccl-config-latest.yaml

diff --git a/gpudirect-tcpx/nccl-config-latest.yaml b/gpudirect-tcpx/nccl-config-latest.yaml
new file mode 100644
index 000000000..ac4b3593e
--- /dev/null
+++ b/gpudirect-tcpx/nccl-config-latest.yaml
@@ -0,0 +1,59 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: nccl-configmap-latest
+data:
+  allgather.sh: |-
+    #!/bin/bash
+    for script in /configs/*; do
+      name=$(basename $script)
+      cp $script "/scripts/$name"
+      chmod +x "/scripts/$name"
+    done
+    /scripts/init_ssh.sh ${@};
+    pushd /scripts;
+    /scripts/gen_hostfiles.sh ${@};
+    popd;
+    /scripts/run-allgather.sh 8 eth1,eth2,eth3,eth4 1M 512M ${#};
+  run-nccl.sh: |-
+    #!/bin/bash
+    SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+    source "${SCRIPT_DIR}"/unix_client_prefix_selection.sh
+    benchmark=$1
+    ld_library_path_override=$2
+    gpu_per_node=$3
+    socket_ifnames=$4
+    data_b=$5
+    data_e=$6
+    nhosts=2
+    if ! [[ -z "$7" ]]; then nhosts=$7; fi
+    LD_LIBRARY_PATH=${ld_library_path_override} \
+    mpirun --mca btl tcp,self --mca btl_tcp_if_include eth0 --allow-run-as-root \
+    --mca orte_base_help_aggregate 0 \
+    --mca pcompress_base_silence_warning 1 \
+    -np $(( gpu_per_node * "${nhosts}" )) \
+    --hostfile "${SCRIPT_DIR}/hostfiles${nhosts}/hostfile${gpu_per_node}" \
+    -x LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" \
+    -x NCCL_SOCKET_IFNAME=eth0 \
+    -x NCCL_DYNAMIC_CHUNK_SIZE=524288 \
+    -x NCCL_P2P_NET_CHUNKSIZE=524288 \
+    -x NCCL_P2P_PCI_CHUNKSIZE=524288 \
+    -x NCCL_P2P_NVL_CHUNKSIZE=1048576 \
+    -x NCCL_GPUDIRECTTCPX_TX_BINDINGS="eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177" \
+    -x NCCL_GPUDIRECTTCPX_RX_BINDINGS="eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191" \
+    -x NCCL_NSOCKS_PERTHREAD=4 \
+    -x NCCL_SOCKET_NTHREADS=1 \
+    -x NCCL_BUFFSIZE=4194304 \
+    -x NCCL_DEBUG=INFO -x NCCL_DEBUG_SUBSYS=ENV \
+    -x NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=eth1,eth2,eth3,eth4 \
+    -x NCCL_GPUDIRECTTCPX_CTRL_DEV=eth0 \
+    -x NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=500000 \
+    -x NCCL_CROSS_NIC=0 \
+    -x NCCL_ALGO=Ring \
+    -x NCCL_PROTO=Simple \
+    -x NCCL_NET_GDR_LEVEL=PIX \
+    -x NCCL_P2P_PXN_LEVEL=0 \
+    taskset -c 0-7,104-111,52-59,156-163 \
+    /third_party/nccl-tests-mpi/build/"${benchmark}" \
+    -b "${data_b}" -e "${data_e}" -f 2 -g 1 -w 5 --iters 100 -c 0 2>&1 \
+    | tee "a_${nhosts}_${gpu_per_node}_${socket_ifnames}.txt"
diff --git a/gpudirect-tcpx/nccl-test-latest.yaml b/gpudirect-tcpx/nccl-test-latest.yaml
index 0744288aa..c05227ba7 100644
--- a/gpudirect-tcpx/nccl-test-latest.yaml
+++ b/gpudirect-tcpx/nccl-test-latest.yaml
@@ -117,7 +117,7 @@ spec:
   volumes:
     - name: config-volume
       configMap:
-        name: nccl-configmap
+        name: nccl-configmap-latest
         defaultMode: 0777
     - name: libraries
       hostPath:
@@ -219,7 +219,7 @@ spec:
   volumes:
     - name: config-volume
       configMap:
-        name: nccl-configmap
+        name: nccl-configmap-latest
         defaultMode: 0777
     - name: libraries
       hostPath:

From e3ae253af220d8d41f9179e60381235b85fc2551 Mon Sep 17 00:00:00 2001
From: Joey Krueger <joeykrueger@google.com>
Date: Tue, 19 May 2026 22:37:35 +0000
Subject: [PATCH 3/4] gpudirect-tcpx: Refine nccl-config-latest.yaml and restore
 legacy nccl-config.yaml

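This change refines the `nccl-config-latest.yaml` ConfigMap manifest introduced earlier in this series and restores `nccl-config.yaml` to its original contents, so that the GKE 1.34+ recommendations live only in the new manifest. In `nccl-config-latest.yaml` it regroups the `-x` exports, appends `/usr/local/nvidia/lib64` to `LD_LIBRARY_PATH`, changes `NCCL_ALGO` from `Ring` to `Ring,Tree`, and drops the `NCCL_DEBUG=INFO` / `NCCL_DEBUG_SUBSYS=ENV` exports. The rationale below explains why the new manifest omits the deprecated environment variables and obsolete channel restrictions, aligning it with the official recommendations for the GKE 1.34+ TCPX stack.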

Rationale for changes:

1. Removed `NCCL_GPUDIRECTTCPX_FORCE_ACK=0` & `NCCL_GPUDIRECTTCPX_TX_COMPLETION_NANOSLEEP=1000`
   - Rationale: These manual tuning parameters were workarounds for older, custom out-of-tree TCPX drivers. GKE 1.34 (COS 125) migrates to Linux Kernel 6.12+, which natively supports **Device Memory TCP (devmem TCP)**. The kernel's TCP stack now handles packet acknowledgment and zero-copy transfers natively, making these CPU-timing and socket-level workarounds obsolete. The new tcpx-daemon (v2.0.15) ignores these variables.
   - Proof (Linux Kernel v6.12 Merge): https://lore.kernel.org/netdev/20240831004313.3713467-1-almasrymina@google.com/
   - Proof (Linux Kernel Documentation): https://www.kernel.org/doc/html/v6.12/networking/devmem.html

2. Removed `NCCL_MAX_NCHANNELS=8` & `NCCL_MIN_NCHANNELS=8`
   - Rationale: Setting these variables forces NCCL to bypass its internal, automatic topology-detection and channel-tuning algorithm. In newer NCCL versions (3.1.12+), this tuner is highly optimized to dynamically allocate the optimal number of channels (often up to 24 channels on A3/H100 nodes) to fully saturate the network bandwidth. Manually capping channels at 8 disables this optimization and acts as a performance bottleneck, which is recognized as a primary cause of communication regressions in distributed GPU training (and is actively asserted against in standard ML validation suites like Megatron-LM).
   - Proof (NVIDIA NCCL Tuning Documentation): Bypassing automatic channel selection is documented by NVIDIA as a manual override that should be avoided in production to allow topology-aware tuning:
     https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html

Validated the manifest with a client-side dry run: `kubectl apply --dry-run=client -f gpudirect-tcpx/nccl-config-latest.yaml`.

These updates resolve the discrepancy where the manifest did not reflect the GKE 1.34 user guide recommendations.
---
 gpudirect-tcpx/nccl-config-latest.yaml | 25 ++++++++++++-------------
 gpudirect-tcpx/nccl-config.yaml        |  4 ++++
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/gpudirect-tcpx/nccl-config-latest.yaml b/gpudirect-tcpx/nccl-config-latest.yaml
index ac4b3593e..232126b89 100644
--- a/gpudirect-tcpx/nccl-config-latest.yaml
+++ b/gpudirect-tcpx/nccl-config-latest.yaml
@@ -33,26 +33,25 @@ data:
     --mca pcompress_base_silence_warning 1 \
     -np $(( gpu_per_node * "${nhosts}" )) \
     --hostfile "${SCRIPT_DIR}/hostfiles${nhosts}/hostfile${gpu_per_node}" \
-    -x LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" \
-    -x NCCL_SOCKET_IFNAME=eth0 \
+    -x LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/nvidia/lib64" \
+    -x NCCL_SOCKET_IFNAME="eth0" \
+    -x NCCL_ALGO=Ring,Tree \
+    -x NCCL_PROTO=Simple \
+    -x NCCL_CROSS_NIC=0 \
+    -x NCCL_NET_GDR_LEVEL=PIX \
+    -x NCCL_P2P_PXN_LEVEL=0 \
+    -x NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=eth1,eth2,eth3,eth4 \
+    -x NCCL_GPUDIRECTTCPX_CTRL_DEV=eth0 \
     -x NCCL_DYNAMIC_CHUNK_SIZE=524288 \
     -x NCCL_P2P_NET_CHUNKSIZE=524288 \
     -x NCCL_P2P_PCI_CHUNKSIZE=524288 \
     -x NCCL_P2P_NVL_CHUNKSIZE=1048576 \
-    -x NCCL_GPUDIRECTTCPX_TX_BINDINGS="eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177" \
-    -x NCCL_GPUDIRECTTCPX_RX_BINDINGS="eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191" \
+    -x NCCL_BUFFSIZE=4194304 \
     -x NCCL_NSOCKS_PERTHREAD=4 \
     -x NCCL_SOCKET_NTHREADS=1 \
-    -x NCCL_BUFFSIZE=4194304 \
-    -x NCCL_DEBUG=INFO -x NCCL_DEBUG_SUBSYS=ENV \
-    -x NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=eth1,eth2,eth3,eth4 \
-    -x NCCL_GPUDIRECTTCPX_CTRL_DEV=eth0 \
+    -x NCCL_GPUDIRECTTCPX_TX_BINDINGS="eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177" \
+    -x NCCL_GPUDIRECTTCPX_RX_BINDINGS="eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191" \
     -x NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=500000 \
-    -x NCCL_CROSS_NIC=0 \
-    -x NCCL_ALGO=Ring \
-    -x NCCL_PROTO=Simple \
-    -x NCCL_NET_GDR_LEVEL=PIX \
-    -x NCCL_P2P_PXN_LEVEL=0 \
     taskset -c 0-7,104-111,52-59,156-163 \
     /third_party/nccl-tests-mpi/build/"${benchmark}" \
     -b "${data_b}" -e "${data_e}" -f 2 -g 1 -w 5 --iters 100 -c 0 2>&1 \
diff --git a/gpudirect-tcpx/nccl-config.yaml b/gpudirect-tcpx/nccl-config.yaml
index 2a1d32a39..1dd5d71db 100644
--- a/gpudirect-tcpx/nccl-config.yaml
+++ b/gpudirect-tcpx/nccl-config.yaml
@@ -34,6 +34,8 @@ data:
     -np $(( gpu_per_node * "${nhosts}" )) \
     --hostfile "${SCRIPT_DIR}/hostfiles${nhosts}/hostfile${gpu_per_node}" \
     -x LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" \
+    -x NCCL_GPUDIRECTTCPX_FORCE_ACK=0 \
+    -x NCCL_GPUDIRECTTCPX_TX_COMPLETION_NANOSLEEP=1000 \
     -x NCCL_SOCKET_IFNAME=eth0 \
     -x NCCL_DYNAMIC_CHUNK_SIZE=524288 \
     -x NCCL_P2P_NET_CHUNKSIZE=524288 \
@@ -43,6 +45,8 @@ data:
     -x NCCL_GPUDIRECTTCPX_RX_BINDINGS="eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191" \
     -x NCCL_NSOCKS_PERTHREAD=4 \
     -x NCCL_SOCKET_NTHREADS=1 \
+    -x NCCL_MAX_NCHANNELS=8 \
+    -x NCCL_MIN_NCHANNELS=8 \
     -x NCCL_BUFFSIZE=4194304 \
     -x NCCL_DEBUG=INFO -x NCCL_DEBUG_SUBSYS=ENV \
     -x NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=eth1,eth2,eth3,eth4 \

From 0e5609b09f8daaa066be44cc3aaf49b11e1ea488 Mon Sep 17 00:00:00 2001
From: Joey Krueger <joeykrueger@google.com>
Date: Mon, 1 Jun 2026 18:33:43 +0000
Subject: [PATCH 4/4] feat(tcpx): Finalize NCCL config for GKE 1.34+ and
 clean up redundant files

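- Overwrites nccl-config.yaml with the verified GKE 1.34+ spec: removes the deprecated variables (FORCE_ACK, TX_COMPLETION_NANOSLEEP, MAX_NCHANNELS, MIN_NCHANNELS), drops the NCCL_DEBUG/NCCL_DEBUG_SUBSYS exports, appends /usr/local/nvidia/lib64 to LD_LIBRARY_PATH, and sets NCCL_ALGO=Ring,Tree.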
- Deletes the temporary nccl-config-latest.yaml which is now redundant.
- Updates nccl-test-latest.yaml to point back to the standard nccl-configmap name.
---
 gpudirect-tcpx/nccl-config-latest.yaml | 58 --------------------------
 gpudirect-tcpx/nccl-config.yaml        | 29 ++++++-------
 gpudirect-tcpx/nccl-test-latest.yaml   |  4 +-
 3 files changed, 14 insertions(+), 77 deletions(-)
 delete mode 100644 gpudirect-tcpx/nccl-config-latest.yaml

diff --git a/gpudirect-tcpx/nccl-config-latest.yaml b/gpudirect-tcpx/nccl-config-latest.yaml
deleted file mode 100644
index 232126b89..000000000
--- a/gpudirect-tcpx/nccl-config-latest.yaml
+++ /dev/null
@@ -1,58 +0,0 @@
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: nccl-configmap-latest
-data:
-  allgather.sh: |-
-    #!/bin/bash
-    for script in /configs/*; do
-      name=$(basename $script)
-      cp $script "/scripts/$name"
-      chmod +x "/scripts/$name"
-    done
-    /scripts/init_ssh.sh ${@};
-    pushd /scripts;
-    /scripts/gen_hostfiles.sh ${@};
-    popd;
-    /scripts/run-allgather.sh 8 eth1,eth2,eth3,eth4 1M 512M ${#};
-  run-nccl.sh: |-
-    #!/bin/bash
-    SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-    source "${SCRIPT_DIR}"/unix_client_prefix_selection.sh
-    benchmark=$1
-    ld_library_path_override=$2
-    gpu_per_node=$3
-    socket_ifnames=$4
-    data_b=$5
-    data_e=$6
-    nhosts=2
-    if ! [[ -z "$7" ]]; then nhosts=$7; fi
-    LD_LIBRARY_PATH=${ld_library_path_override} \
-    mpirun --mca btl tcp,self --mca btl_tcp_if_include eth0 --allow-run-as-root \
-    --mca orte_base_help_aggregate 0 \
-    --mca pcompress_base_silence_warning 1 \
-    -np $(( gpu_per_node * "${nhosts}" )) \
-    --hostfile "${SCRIPT_DIR}/hostfiles${nhosts}/hostfile${gpu_per_node}" \
-    -x LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/nvidia/lib64" \
-    -x NCCL_SOCKET_IFNAME="eth0" \
-    -x NCCL_ALGO=Ring,Tree \
-    -x NCCL_PROTO=Simple \
-    -x NCCL_CROSS_NIC=0 \
-    -x NCCL_NET_GDR_LEVEL=PIX \
-    -x NCCL_P2P_PXN_LEVEL=0 \
-    -x NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=eth1,eth2,eth3,eth4 \
-    -x NCCL_GPUDIRECTTCPX_CTRL_DEV=eth0 \
-    -x NCCL_DYNAMIC_CHUNK_SIZE=524288 \
-    -x NCCL_P2P_NET_CHUNKSIZE=524288 \
-    -x NCCL_P2P_PCI_CHUNKSIZE=524288 \
-    -x NCCL_P2P_NVL_CHUNKSIZE=1048576 \
-    -x NCCL_BUFFSIZE=4194304 \
-    -x NCCL_NSOCKS_PERTHREAD=4 \
-    -x NCCL_SOCKET_NTHREADS=1 \
-    -x NCCL_GPUDIRECTTCPX_TX_BINDINGS="eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177" \
-    -x NCCL_GPUDIRECTTCPX_RX_BINDINGS="eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191" \
-    -x NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=500000 \
-    taskset -c 0-7,104-111,52-59,156-163 \
-    /third_party/nccl-tests-mpi/build/"${benchmark}" \
-    -b "${data_b}" -e "${data_e}" -f 2 -g 1 -w 5 --iters 100 -c 0 2>&1 \
-    | tee "a_${nhosts}_${gpu_per_node}_${socket_ifnames}.txt"
diff --git a/gpudirect-tcpx/nccl-config.yaml b/gpudirect-tcpx/nccl-config.yaml
index 1dd5d71db..4c4da2ce1 100644
--- a/gpudirect-tcpx/nccl-config.yaml
+++ b/gpudirect-tcpx/nccl-config.yaml
@@ -33,30 +33,25 @@ data:
     --mca pcompress_base_silence_warning 1 \
     -np $(( gpu_per_node * "${nhosts}" )) \
     --hostfile "${SCRIPT_DIR}/hostfiles${nhosts}/hostfile${gpu_per_node}" \
-    -x LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" \
-    -x NCCL_GPUDIRECTTCPX_FORCE_ACK=0 \
-    -x NCCL_GPUDIRECTTCPX_TX_COMPLETION_NANOSLEEP=1000 \
-    -x NCCL_SOCKET_IFNAME=eth0 \
+    -x LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/nvidia/lib64" \
+    -x NCCL_SOCKET_IFNAME="eth0" \
+    -x NCCL_ALGO=Ring,Tree \
+    -x NCCL_PROTO=Simple \
+    -x NCCL_CROSS_NIC=0 \
+    -x NCCL_NET_GDR_LEVEL=PIX \
+    -x NCCL_P2P_PXN_LEVEL=0 \
+    -x NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=eth1,eth2,eth3,eth4 \
+    -x NCCL_GPUDIRECTTCPX_CTRL_DEV=eth0 \
     -x NCCL_DYNAMIC_CHUNK_SIZE=524288 \
     -x NCCL_P2P_NET_CHUNKSIZE=524288 \
     -x NCCL_P2P_PCI_CHUNKSIZE=524288 \
     -x NCCL_P2P_NVL_CHUNKSIZE=1048576 \
-    -x NCCL_GPUDIRECTTCPX_TX_BINDINGS="eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177" \
-    -x NCCL_GPUDIRECTTCPX_RX_BINDINGS="eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191" \
+    -x NCCL_BUFFSIZE=4194304 \
     -x NCCL_NSOCKS_PERTHREAD=4 \
     -x NCCL_SOCKET_NTHREADS=1 \
-    -x NCCL_MAX_NCHANNELS=8 \
-    -x NCCL_MIN_NCHANNELS=8 \
-    -x NCCL_BUFFSIZE=4194304 \
-    -x NCCL_DEBUG=INFO -x NCCL_DEBUG_SUBSYS=ENV \
-    -x NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=eth1,eth2,eth3,eth4 \
-    -x NCCL_GPUDIRECTTCPX_CTRL_DEV=eth0 \
+    -x NCCL_GPUDIRECTTCPX_TX_BINDINGS="eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177" \
+    -x NCCL_GPUDIRECTTCPX_RX_BINDINGS="eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191" \
     -x NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=500000 \
-    -x NCCL_CROSS_NIC=0 \
-    -x NCCL_ALGO=Ring \
-    -x NCCL_PROTO=Simple \
-    -x NCCL_NET_GDR_LEVEL=PIX \
-    -x NCCL_P2P_PXN_LEVEL=0 \
     taskset -c 0-7,104-111,52-59,156-163 \
     /third_party/nccl-tests-mpi/build/"${benchmark}" \
     -b "${data_b}" -e "${data_e}" -f 2 -g 1 -w 5 --iters 100 -c 0 2>&1 \
diff --git a/gpudirect-tcpx/nccl-test-latest.yaml b/gpudirect-tcpx/nccl-test-latest.yaml
index c05227ba7..0744288aa 100644
--- a/gpudirect-tcpx/nccl-test-latest.yaml
+++ b/gpudirect-tcpx/nccl-test-latest.yaml
@@ -117,7 +117,7 @@ spec:
   volumes:
     - name: config-volume
       configMap:
-        name: nccl-configmap-latest
+        name: nccl-configmap
         defaultMode: 0777
     - name: libraries
       hostPath:
@@ -219,7 +219,7 @@ spec:
   volumes:
     - name: config-volume
       configMap:
-        name: nccl-configmap-latest
+        name: nccl-configmap
         defaultMode: 0777
     - name: libraries
       hostPath: