diff --git a/gpudirect-tcpx/nccl-config.yaml b/gpudirect-tcpx/nccl-config.yaml index 1dd5d71db..4c4da2ce1 100644 --- a/gpudirect-tcpx/nccl-config.yaml +++ b/gpudirect-tcpx/nccl-config.yaml @@ -33,30 +33,25 @@ data: --mca pcompress_base_silence_warning 1 \ -np $(( gpu_per_node * "${nhosts}" )) \ --hostfile "${SCRIPT_DIR}/hostfiles${nhosts}/hostfile${gpu_per_node}" \ - -x LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" \ - -x NCCL_GPUDIRECTTCPX_FORCE_ACK=0 \ - -x NCCL_GPUDIRECTTCPX_TX_COMPLETION_NANOSLEEP=1000 \ - -x NCCL_SOCKET_IFNAME=eth0 \ + -x LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/nvidia/lib64" \ + -x NCCL_SOCKET_IFNAME="eth0" \ + -x NCCL_ALGO=Ring,Tree \ + -x NCCL_PROTO=Simple \ + -x NCCL_CROSS_NIC=0 \ + -x NCCL_NET_GDR_LEVEL=PIX \ + -x NCCL_P2P_PXN_LEVEL=0 \ + -x NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=eth1,eth2,eth3,eth4 \ + -x NCCL_GPUDIRECTTCPX_CTRL_DEV=eth0 \ -x NCCL_DYNAMIC_CHUNK_SIZE=524288 \ -x NCCL_P2P_NET_CHUNKSIZE=524288 \ -x NCCL_P2P_PCI_CHUNKSIZE=524288 \ -x NCCL_P2P_NVL_CHUNKSIZE=1048576 \ - -x NCCL_GPUDIRECTTCPX_TX_BINDINGS="eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177" \ - -x NCCL_GPUDIRECTTCPX_RX_BINDINGS="eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191" \ + -x NCCL_BUFFSIZE=4194304 \ -x NCCL_NSOCKS_PERTHREAD=4 \ -x NCCL_SOCKET_NTHREADS=1 \ - -x NCCL_MAX_NCHANNELS=8 \ - -x NCCL_MIN_NCHANNELS=8 \ - -x NCCL_BUFFSIZE=4194304 \ - -x NCCL_DEBUG=INFO -x NCCL_DEBUG_SUBSYS=ENV \ - -x NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=eth1,eth2,eth3,eth4 \ - -x NCCL_GPUDIRECTTCPX_CTRL_DEV=eth0 \ + -x NCCL_GPUDIRECTTCPX_TX_BINDINGS="eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177" \ + -x NCCL_GPUDIRECTTCPX_RX_BINDINGS="eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191" \ -x NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=500000 \ - -x NCCL_CROSS_NIC=0 \ - -x NCCL_ALGO=Ring \ - -x NCCL_PROTO=Simple \ - -x NCCL_NET_GDR_LEVEL=PIX \ - -x NCCL_P2P_PXN_LEVEL=0 \ taskset -c 0-7,104-111,52-59,156-163 \ /third_party/nccl-tests-mpi/build/"${benchmark}" \ -b "${data_b}" -e "${data_e}" -f 2 -g 1 -w 5 --iters 100 -c 0 2>&1 \