Mock Version: 3.5 Mock Version: 3.5 Mock Version: 3.5 ENTER ['do_with_status'](['bash', '--login', '-c', '/usr/bin/rpmbuild -bs --noclean --target x86_64 --nodeps /builddir/build/SPECS/libnccl.spec'], chrootPath='/var/lib/mock/dist-an23-epao-build-295073-65782/root'env={'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'}shell=Falselogger=timeout=86400uid=990gid=135user='mockbuild'nspawn_args=[]unshare_net=TrueprintOutput=False) Executing command: ['bash', '--login', '-c', '/usr/bin/rpmbuild -bs --noclean --target x86_64 --nodeps /builddir/build/SPECS/libnccl.spec'] with env {'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'} and shell False Building target platforms: x86_64 Building for target x86_64 setting SOURCE_DATE_EPOCH=1692662400 Wrote: /builddir/build/SRPMS/libnccl-2.18.3-2.cuda12.1.an23.src.rpm Child return code was: 0 ENTER ['do_with_status'](['bash', '--login', '-c', '/usr/bin/rpmbuild -bb --noclean --target x86_64 --nodeps /builddir/build/SPECS/libnccl.spec'], chrootPath='/var/lib/mock/dist-an23-epao-build-295073-65782/root'env={'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'}shell=Falselogger=timeout=86400uid=990gid=135user='mockbuild'nspawn_args=[]unshare_net=TrueprintOutput=False) Executing command: ['bash', '--login', '-c', '/usr/bin/rpmbuild -bb --noclean --target x86_64 --nodeps /builddir/build/SPECS/libnccl.spec'] with env {'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'} and shell False Building target platforms: x86_64 Building for target x86_64 setting SOURCE_DATE_EPOCH=1692662400 Executing(%prep): /bin/sh -e /var/tmp/rpm-tmp.EVMoYg + umask 022 + cd /builddir/build/BUILD + cd /builddir/build/BUILD + rm -rf nccl-2.18.3-1 + /usr/lib/rpm/rpmuncompress -x /builddir/build/SOURCES/nccl-2.18.3-1.tar.gz + STATUS=0 + '[' 0 -ne 0 ']' + cd nccl-2.18.3-1 + /usr/bin/chmod -Rf a+rX,u+w,g-w,o-w . + /usr/lib/rpm/rpmuncompress /builddir/build/SOURCES/0001-fix-lib-path-in-nccl.pc.patch + /usr/bin/patch -p1 -s --fuzz=0 --no-backup-if-mismatch -f + RPM_EC=0 ++ jobs -p + exit 0 Executing(%build): /bin/sh -e /var/tmp/rpm-tmp.bbP2H6 + umask 022 + cd /builddir/build/BUILD + CFLAGS='-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection' + export CFLAGS + CXXFLAGS='-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection' + export CXXFLAGS + FFLAGS='-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -I/usr/lib64/gfortran/modules' + export FFLAGS + FCFLAGS='-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -I/usr/lib64/gfortran/modules' + export FCFLAGS + LDFLAGS='-Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/anolis/anolis-hardened-ld -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -Wl,--build-id=sha1 ' + export LDFLAGS + LT_SYS_LIBRARY_PATH=/usr/lib64: + export LT_SYS_LIBRARY_PATH + CC=gcc + export CC + CXX=g++ + export CXX + cd nccl-2.18.3-1 + export LD_LIBRARY_PATH=/usr/local/cuda-12-1/lib64 + LD_LIBRARY_PATH=/usr/local/cuda-12-1/lib64 + export 'CFLAGS=usr/local/cuda-12-1/include:-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection' + CFLAGS='usr/local/cuda-12-1/include:-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection' + export PREFIX=/usr + PREFIX=/usr + /usr/bin/make -O -j80 V=1 VERBOSE=1 /usr/bin/make -C src build BUILDDIR=/builddir/build/BUILD/nccl-2.18.3-1/build make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' NVCC_GENCODE is -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Grabbing include/nccl_net.h > /builddir/build/BUILD/nccl-2.18.3-1/build/include/nccl_net.h mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/include install -m 644 include/nccl_net.h /builddir/build/BUILD/nccl-2.18.3-1/build/include/nccl_net.h make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/include Generating nccl.h.in > /builddir/build/BUILD/nccl-2.18.3-1/build/include/nccl.h sed -e "s/\${nccl:Major}/2/g" \ -e "s/\${nccl:Minor}/18/g" \ -e "s/\${nccl:Patch}/3/g" \ -e "s/\${nccl:Suffix}//g" \ -e "s/\${nccl:Version}/21803/g" \ nccl.h.in > /builddir/build/BUILD/nccl-2.18.3-1/build/include/nccl.h make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/lib/pkgconfig Generating nccl.pc.in > /builddir/build/BUILD/nccl-2.18.3-1/build/lib/pkgconfig/nccl.pc sed -e 's|${nccl:Prefix}|\/usr|g' \ -e "s/\${nccl:Major}/2/g" \ -e "s/\${nccl:Minor}/18/g" \ -e "s/\${nccl:Patch}/3/g" \ nccl.pc.in > /builddir/build/BUILD/nccl-2.18.3-1/build/lib/pkgconfig/nccl.pc make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' /usr/bin/make -C collectives/device make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling enhcompat.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/enhcompat.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/enhcompat.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c enhcompat.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/enhcompat.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' NVCC_GENCODE is -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Generating rules > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/Makefile.rules make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling graph/trees.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/trees.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/trees.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c graph/trees.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/trees.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' NVCC_GENCODE is -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_i8.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_u8.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_i32.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_u32.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_i64.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_u64.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_f16.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_f32.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_f64.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_bf16.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_i8.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_u8.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_i32.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_u32.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_i64.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_u64.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_f16.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_f32.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_f64.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_bf16.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_i8.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_u8.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_i32.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_u32.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_i64.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_u64.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_f16.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_f32.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_f64.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_bf16.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_i8.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_u8.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_i32.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_u32.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_i64.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_u64.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_f16.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_f32.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_f64.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_bf16.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_i8.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_u8.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_i32.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_u32.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_i64.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_u64.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_f16.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_f32.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_f64.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_bf16.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_i8.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_u8.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_i32.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_u32.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_i64.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_u64.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_f16.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_f32.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_f64.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_bf16.cu cp sendrecv.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_i8.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_u8.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_i32.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_u32.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_u64.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_i64.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_f16.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_f32.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_f64.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_bf16.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_i8.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_u8.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_i32.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_u32.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_i64.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_u64.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_f16.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_f32.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_f64.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_bf16.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_i8.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_u8.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_i32.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_u32.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_i64.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_u64.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_f16.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_f32.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_f64.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_bf16.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_i8.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_u8.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_i32.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_u32.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_i64.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_u64.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_f16.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_f32.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_f64.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_bf16.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_i8.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_u8.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_i32.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_u32.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_i64.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_u64.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_f16.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_f32.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_f64.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_bf16.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_i8.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_u8.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_i32.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_u32.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_i64.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_u64.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_f16.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_f32.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_f64.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_bf16.cu cp all_reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_i8.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_u8.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_i32.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_u32.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_i64.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_u64.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_f16.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_f32.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_f64.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_bf16.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_i8.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_u8.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_i32.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_u32.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_i64.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_u64.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_f16.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_f32.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_f64.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_bf16.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_u8.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_i8.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_i32.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_u32.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_i64.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_u64.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_f16.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_f32.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_f64.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_bf16.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_i8.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_u8.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_i32.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_u32.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_i64.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_u64.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling misc/cudawrap.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/cudawrap.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/cudawrap.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/cudawrap.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/cudawrap.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_f16.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling misc/param.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/param.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/param.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/param.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/param.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_f32.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_f64.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_bf16.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_i8.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_u8.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_i32.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_u32.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_i64.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_u64.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_f16.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_f32.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_f64.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_bf16.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_i8.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_u8.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_i32.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_u32.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_i64.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_u64.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_f16.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_f32.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_f64.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_bf16.cu cp all_gather.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_i8.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_u8.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_i32.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_u32.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_i64.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_u64.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_f16.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_f32.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_f64.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_bf16.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_i8.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_u8.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_i32.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_u32.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_i64.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling misc/ipcsocket.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/ipcsocket.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/ipcsocket.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/ipcsocket.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/ipcsocket.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_u64.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_f16.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_f32.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_f64.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_bf16.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_i8.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_u8.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_i32.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_u32.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_i64.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_u64.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_f16.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_f32.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_f64.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_bf16.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_i8.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_u8.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_i32.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_u32.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_i64.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_u64.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling misc/shmutils.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/shmutils.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/shmutils.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/shmutils.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/shmutils.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_f16.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_f32.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_f64.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_bf16.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_i8.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_u8.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_i32.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_u32.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_i64.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_u64.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_f16.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_f32.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_f64.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_bf16.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_i8.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_u8.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_i32.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_u32.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_i64.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_u64.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_f16.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_f32.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_f64.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_bf16.cu cp broadcast.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_i8.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_u8.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_i32.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_u32.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_i64.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_u64.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_f16.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_f32.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_f64.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_bf16.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_i8.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_u8.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_i32.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_u32.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_i64.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_u64.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_f16.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_f32.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_f64.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_bf16.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_i8.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_u8.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_i32.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_u32.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_i64.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_u64.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_f16.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_f32.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_f64.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_bf16.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_i8.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_u8.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_i32.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_u32.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_i64.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_u64.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_f16.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_f32.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_f64.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_i8.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_bf16.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_u8.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_i32.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_u32.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_i64.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_u64.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_f16.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_f32.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_f64.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_bf16.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_i8.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_u8.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_i32.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_u32.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_i64.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_u64.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_f16.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_f32.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_f64.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling misc/strongstream.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/strongstream.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/strongstream.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/strongstream.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/strongstream.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_bf16.cu cp reduce.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_i8.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_u8.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_i32.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_u32.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_i64.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_u64.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_f16.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_f32.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_bf16.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_f64.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_i8.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_u8.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_i32.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_u32.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_i64.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_u64.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_f16.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_f32.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_f64.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_bf16.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_i8.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_u8.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_i32.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_u32.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_i64.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_u64.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_f16.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_f32.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_f64.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_bf16.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_i8.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_u8.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_i32.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_u32.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_i64.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_u64.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_f16.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_f32.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_f64.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_bf16.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_i8.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_u8.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_i32.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_u32.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_i64.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_u64.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_f16.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_f32.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_f64.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_bf16.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_i8.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_i8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_u8.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_u8.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_i32.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_i32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_u32.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_u32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_i64.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_i64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_u64.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_u64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_f16.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_f16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_f32.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_f32.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_f64.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_f64.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Copying reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_bf16.cu cp reduce_scatter.cu /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_bf16.cu make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling misc/nvmlwrap.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/nvmlwrap.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/nvmlwrap.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/nvmlwrap.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/nvmlwrap.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling misc/socket.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/socket.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/socket.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/socket.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/socket.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling debug.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/debug.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/debug.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c debug.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/debug.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/core.h:62, from debug.cc:7: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling graph/rings.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/rings.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/rings.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c graph/rings.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/rings.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/core.h:62, from graph/rings.cc:7: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ graph/rings.cc: In function 'ncclResult_t ncclBuildRings(int, int*, int, int, int*, int*)': graph/rings.cc:22:80: warning: unused parameter 'prev' [-Wunused-parameter] 22 | ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) { | ~~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling misc/ibvwrap.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/ibvwrap.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/ibvwrap.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/ibvwrap.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/ibvwrap.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/core.h:62, from include/ibvwrap.h:21, from misc/ibvwrap.cc:7: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling collectives/reduce_scatter.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/reduce_scatter.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/reduce_scatter.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c collectives/reduce_scatter.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/reduce_scatter.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from include/enqueue.h:10, from collectives/reduce_scatter.cc:7: include/devcomm.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/devcomm.h:350:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 350 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/devcomm.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/devcomm.h:351:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 351 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:62, from include/info.h:13, from include/graph.h:114, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ collectives/reduce_scatter.cc: In function 'ncclResult_t ncclReduceScatter(const void*, void*, size_t, ncclDataType_t, ncclRedOp_t, ncclComm*, cudaStream_t)': collectives/reduce_scatter.cc:23:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::description' [-Wmissing-field-initializers] 23 | }; | ^ collectives/reduce_scatter.cc:23:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::arrayOrUnionDetail' [-Wmissing-field-initializers] collectives/reduce_scatter.cc:23:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::offset' [-Wmissing-field-initializers] collectives/reduce_scatter.cc:23:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives/reduce_scatter.cc:23:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives/reduce_scatter.cc:23:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives/reduce_scatter.cc:23:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives/reduce_scatter.cc:29:56: warning: missing initializer for member 'ncclInfo::opFull' [-Wmissing-field-initializers] 29 | REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS }; | ^ collectives/reduce_scatter.cc:29:56: warning: missing initializer for member 'ncclInfo::algorithm' [-Wmissing-field-initializers] collectives/reduce_scatter.cc:29:56: warning: missing initializer for member 'ncclInfo::protocol' [-Wmissing-field-initializers] collectives/reduce_scatter.cc:29:56: warning: missing initializer for member 'ncclInfo::pattern' [-Wmissing-field-initializers] collectives/reduce_scatter.cc:29:56: warning: missing initializer for member 'ncclInfo::nChannels' [-Wmissing-field-initializers] collectives/reduce_scatter.cc:29:56: warning: missing initializer for member 'ncclInfo::nThreads' [-Wmissing-field-initializers] collectives/reduce_scatter.cc:29:56: warning: missing initializer for member 'ncclInfo::nBytes' [-Wmissing-field-initializers] collectives/reduce_scatter.cc:29:56: warning: missing initializer for member 'ncclInfo::nstepsPerLoop' [-Wmissing-field-initializers] collectives/reduce_scatter.cc:29:56: warning: missing initializer for member 'ncclInfo::nchunksPerLoop' [-Wmissing-field-initializers] collectives/reduce_scatter.cc:29:56: warning: missing initializer for member 'ncclInfo::chunkSize' [-Wmissing-field-initializers] collectives/reduce_scatter.cc:29:56: warning: missing initializer for member 'ncclInfo::channelId' [-Wmissing-field-initializers] make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling net.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/net.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/net.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c net.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/net.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from include/net.h:12, from net.cc:1: include/devcomm.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/devcomm.h:350:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 350 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/devcomm.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/devcomm.h:351:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 351 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:62, from include/info.h:13, from include/graph.h:114, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ net.cc: In function 'ncclResult_t ncclNet_v4_as_v6_isend(void*, void*, int, int, void*, void**)': net.cc:37:86: warning: unused parameter 'tag' [-Wunused-parameter] 37 | static ncclResult_t ncclNet_v4_as_v6_isend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { | ~~~~^~~ net.cc: In function 'ncclResult_t ncclNet_v4_as_v6_irecv(void*, int, void**, int*, int*, void**, void**)': net.cc:41:97: warning: unused parameter 'tags' [-Wunused-parameter] 41 | static ncclResult_t ncclNet_v4_as_v6_irecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { | ~~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling misc/utils.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/utils.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/utils.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/utils.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/utils.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/core.h:62, from misc/utils.cc:8: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling misc/argcheck.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/argcheck.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/argcheck.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/argcheck.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/argcheck.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/core.h:62, from include/argcheck.h:10, from misc/argcheck.cc:7: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from include/info.h:11, from include/argcheck.h:11: include/devcomm.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/devcomm.h:350:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 350 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/devcomm.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/devcomm.h:351:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 351 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling transport.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from transport.cc:7: include/devcomm.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/devcomm.h:350:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 350 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/devcomm.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/devcomm.h:351:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 351 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:62, from include/info.h:13, from include/graph.h:114, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from include/core.h:59: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclConnect; size_t = long unsigned int]': transport.cc:252:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclTransportCollNetSetup(ncclComm*, ncclTopoGraph*, ncclChannel*, int, int, int, int)::; size_t = long unsigned int]': transport.cc:255:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling graph/tuning.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/tuning.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/tuning.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c graph/tuning.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/tuning.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/core.h:62, from graph/tuning.cc:7: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from graph/tuning.cc:8: include/devcomm.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/devcomm.h:350:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 350 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/devcomm.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/devcomm.h:351:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 351 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling collectives/all_reduce.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/all_reduce.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/all_reduce.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c collectives/all_reduce.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/all_reduce.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from include/enqueue.h:10, from collectives/all_reduce.cc:7: include/devcomm.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/devcomm.h:350:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 350 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/devcomm.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/devcomm.h:351:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 351 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:62, from include/info.h:13, from include/graph.h:114, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ collectives/all_reduce.cc: In function 'ncclResult_t ncclAllReduce(const void*, void*, size_t, ncclDataType_t, ncclRedOp_t, ncclComm*, cudaStream_t)': collectives/all_reduce.cc:23:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::description' [-Wmissing-field-initializers] 23 | }; | ^ collectives/all_reduce.cc:23:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::arrayOrUnionDetail' [-Wmissing-field-initializers] collectives/all_reduce.cc:23:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::offset' [-Wmissing-field-initializers] collectives/all_reduce.cc:23:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives/all_reduce.cc:23:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives/all_reduce.cc:23:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives/all_reduce.cc:23:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives/all_reduce.cc:29:48: warning: missing initializer for member 'ncclInfo::opFull' [-Wmissing-field-initializers] 29 | ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS }; | ^ collectives/all_reduce.cc:29:48: warning: missing initializer for member 'ncclInfo::algorithm' [-Wmissing-field-initializers] collectives/all_reduce.cc:29:48: warning: missing initializer for member 'ncclInfo::protocol' [-Wmissing-field-initializers] collectives/all_reduce.cc:29:48: warning: missing initializer for member 'ncclInfo::pattern' [-Wmissing-field-initializers] collectives/all_reduce.cc:29:48: warning: missing initializer for member 'ncclInfo::nChannels' [-Wmissing-field-initializers] collectives/all_reduce.cc:29:48: warning: missing initializer for member 'ncclInfo::nThreads' [-Wmissing-field-initializers] collectives/all_reduce.cc:29:48: warning: missing initializer for member 'ncclInfo::nBytes' [-Wmissing-field-initializers] collectives/all_reduce.cc:29:48: warning: missing initializer for member 'ncclInfo::nstepsPerLoop' [-Wmissing-field-initializers] collectives/all_reduce.cc:29:48: warning: missing initializer for member 'ncclInfo::nchunksPerLoop' [-Wmissing-field-initializers] collectives/all_reduce.cc:29:48: warning: missing initializer for member 'ncclInfo::chunkSize' [-Wmissing-field-initializers] collectives/all_reduce.cc:29:48: warning: missing initializer for member 'ncclInfo::channelId' [-Wmissing-field-initializers] make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling graph/connect.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/connect.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/connect.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c graph/connect.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/connect.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from graph/connect.cc:7: include/devcomm.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/devcomm.h:350:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 350 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/devcomm.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/devcomm.h:351:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 351 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:62, from include/info.h:13, from include/graph.h:114, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ graph/connect.cc: In function 'ncclResult_t connectTrees(ncclComm*, int*, int*, int*, int*)': graph/connect.cc:130:119: warning: unused parameter 'treePatterns' [-Wunused-parameter] 130 | static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* treePatterns) { | ~~~~~^~~~~~~~~~~~ In file included from include/core.h:59: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = int; size_t = long unsigned int]': graph/connect.cc:174:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling init_nvtx.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/init_nvtx.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/init_nvtx.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c init_nvtx.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/init_nvtx.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from init_nvtx.cc:2: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ init_nvtx.cc:10:1: warning: missing initializer for member 'nvtxPayloadEnum_t::isFlag' [-Wmissing-field-initializers] 10 | }; | ^ init_nvtx.cc:10:1: warning: missing initializer for member 'nvtxPayloadEnum_t::isFlag' [-Wmissing-field-initializers] init_nvtx.cc:10:1: warning: missing initializer for member 'nvtxPayloadEnum_t::isFlag' [-Wmissing-field-initializers] init_nvtx.cc:10:1: warning: missing initializer for member 'nvtxPayloadEnum_t::isFlag' [-Wmissing-field-initializers] init_nvtx.cc:10:1: warning: missing initializer for member 'nvtxPayloadEnum_t::isFlag' [-Wmissing-field-initializers] make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling collectives/sendrecv.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/sendrecv.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/sendrecv.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c collectives/sendrecv.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/sendrecv.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from include/enqueue.h:10, from collectives/sendrecv.cc:7: include/devcomm.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/devcomm.h:350:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 350 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/devcomm.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/devcomm.h:351:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 351 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:62, from include/info.h:13, from include/graph.h:114, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ collectives/sendrecv.cc:18:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::description' [-Wmissing-field-initializers] 18 | }; | ^ collectives/sendrecv.cc:18:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::arrayOrUnionDetail' [-Wmissing-field-initializers] collectives/sendrecv.cc:18:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::offset' [-Wmissing-field-initializers] collectives/sendrecv.cc:18:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives/sendrecv.cc:18:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives/sendrecv.cc:18:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives/sendrecv.cc:18:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives/sendrecv.cc: In function 'ncclResult_t ncclSend(const void*, size_t, ncclDataType_t, int, ncclComm_t, cudaStream_t)': collectives/sendrecv.cc:29:10: warning: missing initializer for member 'ncclInfo::opFull' [-Wmissing-field-initializers] 29 | 1, 1 }; | ^ collectives/sendrecv.cc:29:10: warning: missing initializer for member 'ncclInfo::algorithm' [-Wmissing-field-initializers] collectives/sendrecv.cc:29:10: warning: missing initializer for member 'ncclInfo::protocol' [-Wmissing-field-initializers] collectives/sendrecv.cc:29:10: warning: missing initializer for member 'ncclInfo::pattern' [-Wmissing-field-initializers] collectives/sendrecv.cc:29:10: warning: missing initializer for member 'ncclInfo::nChannels' [-Wmissing-field-initializers] collectives/sendrecv.cc:29:10: warning: missing initializer for member 'ncclInfo::nThreads' [-Wmissing-field-initializers] collectives/sendrecv.cc:29:10: warning: missing initializer for member 'ncclInfo::nBytes' [-Wmissing-field-initializers] collectives/sendrecv.cc:29:10: warning: missing initializer for member 'ncclInfo::nstepsPerLoop' [-Wmissing-field-initializers] collectives/sendrecv.cc:29:10: warning: missing initializer for member 'ncclInfo::nchunksPerLoop' [-Wmissing-field-initializers] collectives/sendrecv.cc:29:10: warning: missing initializer for member 'ncclInfo::chunkSize' [-Wmissing-field-initializers] collectives/sendrecv.cc:29:10: warning: missing initializer for member 'ncclInfo::channelId' [-Wmissing-field-initializers] collectives/sendrecv.cc: In function 'ncclResult_t ncclRecv(void*, size_t, ncclDataType_t, int, ncclComm_t, cudaStream_t)': collectives/sendrecv.cc:46:10: warning: missing initializer for member 'ncclInfo::opFull' [-Wmissing-field-initializers] 46 | 1, 1 }; | ^ collectives/sendrecv.cc:46:10: warning: missing initializer for member 'ncclInfo::algorithm' [-Wmissing-field-initializers] collectives/sendrecv.cc:46:10: warning: missing initializer for member 'ncclInfo::protocol' [-Wmissing-field-initializers] collectives/sendrecv.cc:46:10: warning: missing initializer for member 'ncclInfo::pattern' [-Wmissing-field-initializers] collectives/sendrecv.cc:46:10: warning: missing initializer for member 'ncclInfo::nChannels' [-Wmissing-field-initializers] collectives/sendrecv.cc:46:10: warning: missing initializer for member 'ncclInfo::nThreads' [-Wmissing-field-initializers] collectives/sendrecv.cc:46:10: warning: missing initializer for member 'ncclInfo::nBytes' [-Wmissing-field-initializers] collectives/sendrecv.cc:46:10: warning: missing initializer for member 'ncclInfo::nstepsPerLoop' [-Wmissing-field-initializers] collectives/sendrecv.cc:46:10: warning: missing initializer for member 'ncclInfo::nchunksPerLoop' [-Wmissing-field-initializers] collectives/sendrecv.cc:46:10: warning: missing initializer for member 'ncclInfo::chunkSize' [-Wmissing-field-initializers] collectives/sendrecv.cc:46:10: warning: missing initializer for member 'ncclInfo::channelId' [-Wmissing-field-initializers] make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling misc/ibvsymbols.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/ibvsymbols.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/ibvsymbols.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/ibvsymbols.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/ibvsymbols.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/core.h:62, from misc/ibvsymbols.cc:64: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling graph/paths.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/paths.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/paths.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c graph/paths.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/paths.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/core.h:62, from graph/paths.cc:7: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from include/graph.h:11, from graph/paths.cc:8: include/devcomm.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/devcomm.h:350:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 350 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/devcomm.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/devcomm.h:351:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 351 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:59: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclTopoLinkList; size_t = long unsigned int]': graph/paths.cc:36:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = int; size_t = long unsigned int]': graph/paths.cc:620:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = long int; size_t = long unsigned int]': graph/paths.cc:621:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling misc/profiler.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/profiler.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/profiler.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/profiler.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/profiler.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/proxy.h:10, from include/profiler.h:10, from misc/profiler.cc:7: include/devcomm.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/devcomm.h:350:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 350 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/devcomm.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/devcomm.h:351:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 351 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:62, from include/info.h:13, from include/proxy.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ misc/profiler.cc: In function 'ncclResult_t ncclProfilingRecord(ncclProxyArgs*, int, int, int)': misc/profiler.cc:113:56: warning: unused parameter 'args' [-Wunused-parameter] 113 | ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { return ncclSuccess; } | ~~~~~~~~~~~~~~~~~~~~~~^~~~ misc/profiler.cc:113:66: warning: unused parameter 'sub' [-Wunused-parameter] 113 | ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { return ncclSuccess; } | ~~~~^~~ misc/profiler.cc:113:75: warning: unused parameter 'step' [-Wunused-parameter] 113 | ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { return ncclSuccess; } | ~~~~^~~~ misc/profiler.cc:113:85: warning: unused parameter 'state' [-Wunused-parameter] 113 | ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { return ncclSuccess; } | ~~~~^~~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling collectives/reduce.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/reduce.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/reduce.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c collectives/reduce.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/reduce.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from include/enqueue.h:10, from collectives/reduce.cc:7: include/devcomm.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/devcomm.h:350:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 350 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/devcomm.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/devcomm.h:351:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 351 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:62, from include/info.h:13, from include/graph.h:114, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ collectives/reduce.cc: In function 'ncclResult_t ncclReduce(const void*, void*, size_t, ncclDataType_t, ncclRedOp_t, int, ncclComm_t, cudaStream_t)': collectives/reduce.cc:25:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::description' [-Wmissing-field-initializers] 25 | }; | ^ collectives/reduce.cc:25:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::arrayOrUnionDetail' [-Wmissing-field-initializers] collectives/reduce.cc:25:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::offset' [-Wmissing-field-initializers] collectives/reduce.cc:25:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives/reduce.cc:25:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives/reduce.cc:25:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives/reduce.cc:25:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives/reduce.cc:25:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives/reduce.cc:25:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives/reduce.cc:31:42: warning: missing initializer for member 'ncclInfo::opFull' [-Wmissing-field-initializers] 31 | REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS }; | ^ collectives/reduce.cc:31:42: warning: missing initializer for member 'ncclInfo::algorithm' [-Wmissing-field-initializers] collectives/reduce.cc:31:42: warning: missing initializer for member 'ncclInfo::protocol' [-Wmissing-field-initializers] collectives/reduce.cc:31:42: warning: missing initializer for member 'ncclInfo::pattern' [-Wmissing-field-initializers] collectives/reduce.cc:31:42: warning: missing initializer for member 'ncclInfo::nChannels' [-Wmissing-field-initializers] collectives/reduce.cc:31:42: warning: missing initializer for member 'ncclInfo::nThreads' [-Wmissing-field-initializers] collectives/reduce.cc:31:42: warning: missing initializer for member 'ncclInfo::nBytes' [-Wmissing-field-initializers] collectives/reduce.cc:31:42: warning: missing initializer for member 'ncclInfo::nstepsPerLoop' [-Wmissing-field-initializers] collectives/reduce.cc:31:42: warning: missing initializer for member 'ncclInfo::nchunksPerLoop' [-Wmissing-field-initializers] collectives/reduce.cc:31:42: warning: missing initializer for member 'ncclInfo::chunkSize' [-Wmissing-field-initializers] collectives/reduce.cc:31:42: warning: missing initializer for member 'ncclInfo::channelId' [-Wmissing-field-initializers] make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling bootstrap.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/bootstrap.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/bootstrap.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c bootstrap.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/bootstrap.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/core.h:62, from bootstrap.cc:8: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from include/transport.h:10, from include/comm.h:10, from include/bootstrap.h:11, from bootstrap.cc:10: include/devcomm.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/devcomm.h:350:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 350 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/devcomm.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/devcomm.h:351:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 351 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ bootstrap.cc: In function 'ncclResult_t bootstrapCreateRoot(ncclBootstrapHandle*, bool)': bootstrap.cc:169:75: warning: unused parameter 'idFromEnv' [-Wunused-parameter] 169 | ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv) { | ~~~~~^~~~~~~~~ bootstrap.cc: In function 'ncclResult_t bootstrapInit(ncclBootstrapHandle*, ncclComm*)': bootstrap.cc:235:29: warning: missing initializer for member 'extInfo::nranks' [-Wmissing-field-initializers] 235 | struct extInfo info = { 0 }; | ^ bootstrap.cc:235:29: warning: missing initializer for member 'extInfo::extAddressListenRoot' [-Wmissing-field-initializers] bootstrap.cc:235:29: warning: missing initializer for member 'extInfo::extAddressListen' [-Wmissing-field-initializers] In file included from include/core.h:59: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclSocketAddress; size_t = long unsigned int]': bootstrap.cc:107:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclSocket; size_t = long unsigned int]': bootstrap.cc:174:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = bootstrapRootArgs; size_t = long unsigned int]': bootstrap.cc:179:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = bootstrapState; size_t = long unsigned int]': bootstrap.cc:237:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = unexConn; size_t = long unsigned int]': bootstrap.cc:478:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling proxy.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/proxy.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/proxy.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c proxy.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/proxy.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from proxy.cc:7: include/devcomm.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/devcomm.h:350:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 350 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/devcomm.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/devcomm.h:351:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 351 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:62, from include/info.h:13, from include/graph.h:114, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ proxy.cc: In function 'void ncclDumpProxyState(int)': proxy.cc:787:29: warning: unused parameter 'signal' [-Wunused-parameter] 787 | void ncclDumpProxyState(int signal) { | ~~~~^~~~~~ proxy.cc: In function 'ncclResult_t ncclProxyConnect(ncclComm*, int, int, int, ncclProxyConnector*)': proxy.cc:1037:35: warning: missing initializer for member 'ncclProxyInitReq::send' [-Wmissing-field-initializers] 1037 | struct ncclProxyInitReq req = {0}; | ^ proxy.cc:1037:35: warning: missing initializer for member 'ncclProxyInitReq::tpLocalRank' [-Wmissing-field-initializers] proxy.cc:1037:35: warning: missing initializer for member 'ncclProxyInitReq::tpRank' [-Wmissing-field-initializers] proxy.cc:1037:35: warning: missing initializer for member 'ncclProxyInitReq::sameProcess' [-Wmissing-field-initializers] proxy.cc:1044:37: warning: missing initializer for member 'ncclProxyInitResp::devShmPath' [-Wmissing-field-initializers] 1044 | struct ncclProxyInitResp resp = {0}; | ^ proxy.cc: In function 'ncclResult_t ncclProxyClientConvertFdBlocking(ncclComm*, ncclProxyConnector*, int, int*)': proxy.cc:1070:38: warning: missing initializer for member 'ncclIpcSocket::socketName' [-Wmissing-field-initializers] 1070 | struct ncclIpcSocket ipcSock = { 0 }; | ^ proxy.cc:1070:38: warning: missing initializer for member 'ncclIpcSocket::abortFlag' [-Wmissing-field-initializers] proxy.cc: In function 'ncclResult_t proxyConvertFd(ncclProxyLocalPeer*, void*, ncclProxyState*, int)': proxy.cc:1288:38: warning: missing initializer for member 'ncclIpcSocket::socketName' [-Wmissing-field-initializers] 1288 | struct ncclIpcSocket ipcSock = { 0 }; | ^ proxy.cc:1288:38: warning: missing initializer for member 'ncclIpcSocket::abortFlag' [-Wmissing-field-initializers] In file included from include/core.h:59: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclExpectedProxyResponse; size_t = long unsigned int]': proxy.cc:81:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclProxyPool; size_t = long unsigned int]': proxy.cc:193:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclProxyConnection; size_t = long unsigned int]': proxy.cc:943:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclSocket; size_t = long unsigned int]': proxy.cc:1021:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclProxyOps; size_t = long unsigned int]': proxy.cc:1022:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = void*; size_t = long unsigned int]': proxy.cc:1023:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclProxyAsyncOp; size_t = long unsigned int]': proxy.cc:1357:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = char; size_t = long unsigned int]': proxy.cc:1365:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclProxyState; size_t = long unsigned int]': proxy.cc:1552:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling collectives/all_gather.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/all_gather.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/all_gather.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c collectives/all_gather.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/all_gather.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from include/enqueue.h:10, from collectives/all_gather.cc:7: include/devcomm.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/devcomm.h:350:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 350 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/devcomm.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/devcomm.h:351:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 351 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:62, from include/info.h:13, from include/graph.h:114, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ collectives/all_gather.cc: In function 'ncclResult_t ncclAllGather(const void*, void*, size_t, ncclDataType_t, ncclComm_t, cudaStream_t)': collectives/all_gather.cc:17:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::description' [-Wmissing-field-initializers] 17 | }; | ^ collectives/all_gather.cc:17:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::arrayOrUnionDetail' [-Wmissing-field-initializers] collectives/all_gather.cc:17:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::offset' [-Wmissing-field-initializers] collectives/all_gather.cc:17:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives/all_gather.cc:17:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives/all_gather.cc:23:48: warning: missing initializer for member 'ncclInfo::opFull' [-Wmissing-field-initializers] 23 | ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS }; | ^ collectives/all_gather.cc:23:48: warning: missing initializer for member 'ncclInfo::algorithm' [-Wmissing-field-initializers] collectives/all_gather.cc:23:48: warning: missing initializer for member 'ncclInfo::protocol' [-Wmissing-field-initializers] collectives/all_gather.cc:23:48: warning: missing initializer for member 'ncclInfo::pattern' [-Wmissing-field-initializers] collectives/all_gather.cc:23:48: warning: missing initializer for member 'ncclInfo::nChannels' [-Wmissing-field-initializers] collectives/all_gather.cc:23:48: warning: missing initializer for member 'ncclInfo::nThreads' [-Wmissing-field-initializers] collectives/all_gather.cc:23:48: warning: missing initializer for member 'ncclInfo::nBytes' [-Wmissing-field-initializers] collectives/all_gather.cc:23:48: warning: missing initializer for member 'ncclInfo::nstepsPerLoop' [-Wmissing-field-initializers] collectives/all_gather.cc:23:48: warning: missing initializer for member 'ncclInfo::nchunksPerLoop' [-Wmissing-field-initializers] collectives/all_gather.cc:23:48: warning: missing initializer for member 'ncclInfo::chunkSize' [-Wmissing-field-initializers] collectives/all_gather.cc:23:48: warning: missing initializer for member 'ncclInfo::channelId' [-Wmissing-field-initializers] make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling transport/nvls.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/nvls.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/nvls.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport/nvls.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/nvls.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from transport/nvls.cc:9: include/devcomm.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/devcomm.h:350:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 350 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/devcomm.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/devcomm.h:351:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 351 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:62, from include/info.h:13, from include/graph.h:114, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ transport/nvls.cc: In function 'ncclResult_t nvlsCanConnect(int*, ncclTopoSystem*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*)': transport/nvls.cc:25:62: warning: unused parameter 'topo' [-Wunused-parameter] 25 | ncclResult_t nvlsCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~ transport/nvls.cc:25:90: warning: unused parameter 'graph' [-Wunused-parameter] 25 | ncclResult_t nvlsCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/nvls.cc:25:118: warning: unused parameter 'info1' [-Wunused-parameter] 25 | ncclResult_t nvlsCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/nvls.cc:25:146: warning: unused parameter 'info2' [-Wunused-parameter] 25 | ncclResult_t nvlsCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/nvls.cc: In function 'ncclResult_t nvlsSendFree(ncclConnector*)': transport/nvls.cc:31:49: warning: unused parameter 'send' [-Wunused-parameter] 31 | ncclResult_t nvlsSendFree(struct ncclConnector* send) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~ transport/nvls.cc: In function 'ncclResult_t nvlsRecvFree(ncclConnector*)': transport/nvls.cc:35:49: warning: unused parameter 'recv' [-Wunused-parameter] 35 | ncclResult_t nvlsRecvFree(struct ncclConnector* recv) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~ transport/nvls.cc: In function 'ncclResult_t nvlsGetProperties(ncclComm*, ncclNvlsSharedRes*, int, int, size_t)': transport/nvls.cc:46:49: warning: unused parameter 'comm' [-Wunused-parameter] 46 | ncclResult_t nvlsGetProperties(struct ncclComm *comm, struct ncclNvlsSharedRes* resources, int dev, int nranks, size_t size) { | ~~~~~~~~~~~~~~~~~^~~~ transport/nvls.cc: In function 'ncclResult_t nvlsGroupCreate(ncclComm*, ncclNvlsSharedRes*, int, unsigned int, char*)': transport/nvls.cc:69:47: warning: unused parameter 'comm' [-Wunused-parameter] 69 | ncclResult_t nvlsGroupCreate(struct ncclComm *comm, struct ncclNvlsSharedRes* resources, int rank, unsigned int nranks, char* shareableHandle) { | ~~~~~~~~~~~~~~~~~^~~~ transport/nvls.cc: In function 'ncclResult_t nvlsGroupAddDevice(ncclComm*, ncclNvlsSharedRes*)': transport/nvls.cc:91:50: warning: unused parameter 'comm' [-Wunused-parameter] 91 | ncclResult_t nvlsGroupAddDevice(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { | ~~~~~~~~~~~~~~~~~^~~~ transport/nvls.cc: In function 'ncclResult_t nvlsGroupDisconnect(ncclComm*, ncclNvlsSharedRes*)': transport/nvls.cc:125:51: warning: unused parameter 'comm' [-Wunused-parameter] 125 | ncclResult_t nvlsGroupDisconnect(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { | ~~~~~~~~~~~~~~~~~^~~~ transport/nvls.cc: In function 'ncclResult_t nvlsGroupBindMem(ncclComm*, ncclNvlsSharedRes*)': transport/nvls.cc:138:48: warning: unused parameter 'comm' [-Wunused-parameter] 138 | ncclResult_t nvlsGroupBindMem(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { | ~~~~~~~~~~~~~~~~~^~~~ transport/nvls.cc: In function 'ncclResult_t nvlsGroupMapMem(ncclComm*, ncclNvlsSharedRes*)': transport/nvls.cc:184:47: warning: unused parameter 'comm' [-Wunused-parameter] 184 | ncclResult_t nvlsGroupMapMem(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { | ~~~~~~~~~~~~~~~~~^~~~ transport/nvls.cc: In function 'ncclResult_t nvlsGroupUnmapMem(ncclComm*, ncclNvlsSharedRes*)': transport/nvls.cc:202:49: warning: unused parameter 'comm' [-Wunused-parameter] 202 | ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, struct ncclNvlsSharedRes* resources) { | ~~~~~~~~~~~~~~~~~^~~~ In file included from include/core.h:59: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclNvlsSharedRes; size_t = long unsigned int]': transport/nvls.cc:290:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling collectives/broadcast.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/broadcast.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/broadcast.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c collectives/broadcast.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/broadcast.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from include/enqueue.h:10, from collectives/broadcast.cc:7: include/devcomm.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/devcomm.h:350:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 350 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/devcomm.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/devcomm.h:351:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 351 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:62, from include/info.h:13, from include/graph.h:114, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ collectives/broadcast.cc: In function 'ncclResult_t ncclBroadcast(const void*, void*, size_t, ncclDataType_t, int, ncclComm_t, cudaStream_t)': collectives/broadcast.cc:21:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::description' [-Wmissing-field-initializers] 21 | }; | ^ collectives/broadcast.cc:21:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::arrayOrUnionDetail' [-Wmissing-field-initializers] collectives/broadcast.cc:21:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::offset' [-Wmissing-field-initializers] collectives/broadcast.cc:21:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives/broadcast.cc:21:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives/broadcast.cc:21:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] collectives/broadcast.cc:21:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] collectives/broadcast.cc:27:48: warning: missing initializer for member 'ncclInfo::opFull' [-Wmissing-field-initializers] 27 | BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS }; | ^ collectives/broadcast.cc:27:48: warning: missing initializer for member 'ncclInfo::algorithm' [-Wmissing-field-initializers] collectives/broadcast.cc:27:48: warning: missing initializer for member 'ncclInfo::protocol' [-Wmissing-field-initializers] collectives/broadcast.cc:27:48: warning: missing initializer for member 'ncclInfo::pattern' [-Wmissing-field-initializers] collectives/broadcast.cc:27:48: warning: missing initializer for member 'ncclInfo::nChannels' [-Wmissing-field-initializers] collectives/broadcast.cc:27:48: warning: missing initializer for member 'ncclInfo::nThreads' [-Wmissing-field-initializers] collectives/broadcast.cc:27:48: warning: missing initializer for member 'ncclInfo::nBytes' [-Wmissing-field-initializers] collectives/broadcast.cc:27:48: warning: missing initializer for member 'ncclInfo::nstepsPerLoop' [-Wmissing-field-initializers] collectives/broadcast.cc:27:48: warning: missing initializer for member 'ncclInfo::nchunksPerLoop' [-Wmissing-field-initializers] collectives/broadcast.cc:27:48: warning: missing initializer for member 'ncclInfo::chunkSize' [-Wmissing-field-initializers] collectives/broadcast.cc:27:48: warning: missing initializer for member 'ncclInfo::channelId' [-Wmissing-field-initializers] make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling graph/xml.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/xml.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/xml.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c graph/xml.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/xml.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/core.h:62, from graph/xml.cc:12: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ graph/xml.cc: In function 'ncclResult_t ncclTopoGetXmlFromCpu(ncclXmlNode*, ncclXml*)': graph/xml.cc:370:81: warning: unused parameter 'xml' [-Wunused-parameter] 370 | ncclResult_t ncclTopoGetXmlFromCpu(struct ncclXmlNode* cpuNode, struct ncclXml* xml) { | ~~~~~~~~~~~~~~~~^~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling graph/topo.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/topo.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/topo.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c graph/topo.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/topo.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/core.h:62, from graph/topo.cc:7: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from include/graph.h:11, from graph/topo.cc:8: include/devcomm.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/devcomm.h:350:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 350 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/devcomm.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/devcomm.h:351:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 351 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ graph/topo.cc: In function 'ncclResult_t pciPathToInt64(char*, int, int, int64_t*)': graph/topo.cc:31:57: warning: unused parameter 'minOffset' [-Wunused-parameter] 31 | ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) { | ~~~~^~~~~~~~~ graph/topo.cc: In function 'ncclResult_t ncclTopoAddGpu(ncclXmlNode*, ncclTopoSystem*, ncclTopoNode*)': graph/topo.cc:363:80: warning: unused parameter 'system' [-Wunused-parameter] 363 | ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* system, struct ncclTopoNode* gpu) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~ In file included from include/core.h:59: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = long int; size_t = long unsigned int]': graph/topo.cc:188:7: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclTopoSystem; size_t = long unsigned int]': graph/topo.cc:544:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclXml; size_t = long unsigned int]': graph/topo.cc:597:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = int; size_t = long unsigned int]': graph/topo.cc:687:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = long unsigned int; size_t = long unsigned int]': graph/topo.cc:711:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling transport/net_ib.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/net_ib.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/net_ib.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport/net_ib.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/net_ib.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/core.h:62, from transport/net_ib.cc:8: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from include/transport.h:10, from include/comm.h:10, from include/net.h:12, from transport/net_ib.cc:10: include/devcomm.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/devcomm.h:350:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 350 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/devcomm.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/devcomm.h:351:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 351 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ transport/net_ib.cc: In function 'ncclResult_t ncclIbInit(ncclDebugLogger_t)': transport/net_ib.cc:159:43: warning: unused parameter 'logFunction' [-Wunused-parameter] 159 | ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { | ~~~~~~~~~~~~~~~~~~^~~~~~~~~~~ transport/net_ib.cc: In function 'ncclResult_t ncclIbGdrSupport(int)': transport/net_ib.cc:281:35: warning: unused parameter 'ibDev' [-Wunused-parameter] 281 | ncclResult_t ncclIbGdrSupport(int ibDev) { | ~~~~^~~~~ transport/net_ib.cc: In function 'ncclResult_t ncclIbRegMrDmaBuf(void*, void*, size_t, int, uint64_t, int, void**)': transport/net_ib.cc:879:73: warning: unused parameter 'type' [-Wunused-parameter] 879 | ncclResult_t ncclIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { | ~~~~^~~~ In file included from include/core.h:59: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclIbListenComm; size_t = long unsigned int]': transport/net_ib.cc:598:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling transport/net_socket.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/net_socket.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/net_socket.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport/net_socket.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/net_socket.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from transport/net_socket.cc:7: include/devcomm.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/devcomm.h:350:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 350 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/devcomm.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/devcomm.h:351:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 351 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:62, from include/info.h:13, from include/graph.h:114, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ transport/net_socket.cc: In function 'ncclResult_t ncclNetSocketInit(ncclDebugLogger_t)': transport/net_socket.cc:38:50: warning: unused parameter 'logFunction' [-Wunused-parameter] 38 | ncclResult_t ncclNetSocketInit(ncclDebugLogger_t logFunction) { | ~~~~~~~~~~~~~~~~~~^~~~~~~~~~~ transport/net_socket.cc: In function 'ncclResult_t ncclNetSocketRegMr(void*, void*, int, int, void**)': transport/net_socket.cc:535:39: warning: unused parameter 'comm' [-Wunused-parameter] 535 | ncclResult_t ncclNetSocketRegMr(void* comm, void* data, int size, int type, void** mhandle) { | ~~~~~~^~~~ transport/net_socket.cc:535:51: warning: unused parameter 'data' [-Wunused-parameter] 535 | ncclResult_t ncclNetSocketRegMr(void* comm, void* data, int size, int type, void** mhandle) { | ~~~~~~^~~~ transport/net_socket.cc:535:61: warning: unused parameter 'size' [-Wunused-parameter] 535 | ncclResult_t ncclNetSocketRegMr(void* comm, void* data, int size, int type, void** mhandle) { | ~~~~^~~~ transport/net_socket.cc:535:84: warning: unused parameter 'mhandle' [-Wunused-parameter] 535 | ncclResult_t ncclNetSocketRegMr(void* comm, void* data, int size, int type, void** mhandle) { | ~~~~~~~^~~~~~~ transport/net_socket.cc: In function 'ncclResult_t ncclNetSocketDeregMr(void*, void*)': transport/net_socket.cc:538:41: warning: unused parameter 'comm' [-Wunused-parameter] 538 | ncclResult_t ncclNetSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; } | ~~~~~~^~~~ transport/net_socket.cc:538:53: warning: unused parameter 'mhandle' [-Wunused-parameter] 538 | ncclResult_t ncclNetSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; } | ~~~~~~^~~~~~~ transport/net_socket.cc: In function 'ncclResult_t ncclNetSocketIsend(void*, void*, int, int, void*, void**)': transport/net_socket.cc:540:75: warning: unused parameter 'tag' [-Wunused-parameter] 540 | ncclResult_t ncclNetSocketIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { | ~~~~^~~ transport/net_socket.cc:540:86: warning: unused parameter 'mhandle' [-Wunused-parameter] 540 | ncclResult_t ncclNetSocketIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { | ~~~~~~^~~~~~~ transport/net_socket.cc: In function 'ncclResult_t ncclNetSocketIrecv(void*, int, void**, int*, int*, void**, void**)': transport/net_socket.cc:546:86: warning: unused parameter 'tags' [-Wunused-parameter] 546 | ncclResult_t ncclNetSocketIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { | ~~~~~^~~~ transport/net_socket.cc:546:99: warning: unused parameter 'mhandles' [-Wunused-parameter] 546 | ncclResult_t ncclNetSocketIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { | ~~~~~~~^~~~~~~~ transport/net_socket.cc: In function 'ncclResult_t ncclNetSocketIflush(void*, int, void**, int*, void**, void**)': transport/net_socket.cc:553:40: warning: unused parameter 'recvComm' [-Wunused-parameter] 553 | ncclResult_t ncclNetSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { | ~~~~~~^~~~~~~~ transport/net_socket.cc:553:54: warning: unused parameter 'n' [-Wunused-parameter] 553 | ncclResult_t ncclNetSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { | ~~~~^ transport/net_socket.cc:553:64: warning: unused parameter 'data' [-Wunused-parameter] 553 | ncclResult_t ncclNetSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { | ~~~~~~~^~~~ transport/net_socket.cc:553:75: warning: unused parameter 'sizes' [-Wunused-parameter] 553 | ncclResult_t ncclNetSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { | ~~~~~^~~~~ transport/net_socket.cc:553:89: warning: unused parameter 'mhandles' [-Wunused-parameter] 553 | ncclResult_t ncclNetSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { | ~~~~~~~^~~~~~~~ transport/net_socket.cc:553:106: warning: unused parameter 'request' [-Wunused-parameter] 553 | ncclResult_t ncclNetSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { | ~~~~~~~^~~~~~~ In file included from include/core.h:59: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclNetSocketListenComm; size_t = long unsigned int]': transport/net_socket.cc:291:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclNetSocketComm; size_t = long unsigned int]': transport/net_socket.cc:320:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclSocket; size_t = long unsigned int]': transport/net_socket.cc:370:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclNetSocketTask; size_t = long unsigned int]': transport/net_socket.cc:432:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling group.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/group.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/group.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c group.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/group.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from include/group.h:11, from group.cc:7: include/devcomm.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/devcomm.h:350:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 350 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/devcomm.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/devcomm.h:351:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 351 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:62, from include/info.h:13, from include/graph.h:114, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from include/core.h:59: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclPreconnectJob; size_t = long unsigned int]': group.cc:266:7: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling channel.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/channel.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/channel.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c channel.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/channel.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from include/channel.h:9, from channel.cc:7: include/devcomm.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/devcomm.h:350:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 350 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/devcomm.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/devcomm.h:351:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 351 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:62, from include/info.h:13, from include/graph.h:114, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from include/core.h:59: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclChannelPeer; size_t = long unsigned int]': channel.cc:29:7: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling transport/shm.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/shm.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/shm.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport/shm.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/shm.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from transport/shm.cc:7: include/devcomm.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/devcomm.h:350:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 350 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/devcomm.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/devcomm.h:351:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 351 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:62, from include/info.h:13, from include/graph.h:114, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ transport/shm.cc: In function 'ncclResult_t shmCanConnect(int*, ncclTopoSystem*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*)': transport/shm.cc:50:96: warning: unused parameter 'graph' [-Wunused-parameter] 50 | static ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/shm.cc: In function 'ncclResult_t shmSendSetup(ncclComm*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*, ncclConnect*, ncclConnector*, int, int)': transport/shm.cc:76:79: warning: unused parameter 'graph' [-Wunused-parameter] 76 | static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/shm.cc:76:226: warning: unused parameter 'connIndex' [-Wunused-parameter] 76 | static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { | ~~~~^~~~~~~~~ transport/shm.cc: In function 'ncclResult_t shmRecvSetup(ncclComm*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*, ncclConnect*, ncclConnector*, int, int)': transport/shm.cc:99:79: warning: unused parameter 'graph' [-Wunused-parameter] 99 | static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/shm.cc:99:107: warning: unused parameter 'myInfo' [-Wunused-parameter] 99 | static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { | ~~~~~~~~~~~~~~~~~~~~~^~~~~~ transport/shm.cc:99:136: warning: unused parameter 'peerInfo' [-Wunused-parameter] 99 | static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { | ~~~~~~~~~~~~~~~~~~~~~^~~~~~~~ transport/shm.cc:99:211: warning: unused parameter 'channelId' [-Wunused-parameter] 99 | static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { | ~~~~^~~~~~~~~ transport/shm.cc:99:226: warning: unused parameter 'connIndex' [-Wunused-parameter] 99 | static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { | ~~~~^~~~~~~~~ transport/shm.cc: In function 'ncclResult_t shmSendConnect(ncclComm*, ncclConnect*, int, int, ncclConnector*)': transport/shm.cc:161:130: warning: missing initializer for member 'shmProxyInfo::step' [-Wmissing-field-initializers] 161 | struct shmProxyInfo proxyInfo = { NULL, NULL, send->conn.buffs[NCCL_PROTO_SIMPLE], resources->hostMem, resources->remHostMem }; | ^ transport/shm.cc:161:130: warning: missing initializer for member 'shmProxyInfo::stream' [-Wmissing-field-initializers] transport/shm.cc:161:130: warning: missing initializer for member 'shmProxyInfo::events' [-Wmissing-field-initializers] transport/shm.cc:135:96: warning: unused parameter 'nranks' [-Wunused-parameter] 135 | static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { | ~~~~^~~~~~ transport/shm.cc:135:108: warning: unused parameter 'rank' [-Wunused-parameter] 135 | static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { | ~~~~^~~~ transport/shm.cc: In function 'ncclResult_t shmRecvConnect(ncclComm*, ncclConnect*, int, int, ncclConnector*)': transport/shm.cc:191:130: warning: missing initializer for member 'shmProxyInfo::step' [-Wmissing-field-initializers] 191 | struct shmProxyInfo proxyInfo = { NULL, NULL, recv->conn.buffs[NCCL_PROTO_SIMPLE], resources->remHostMem, resources->hostMem }; | ^ transport/shm.cc:191:130: warning: missing initializer for member 'shmProxyInfo::stream' [-Wmissing-field-initializers] transport/shm.cc:191:130: warning: missing initializer for member 'shmProxyInfo::events' [-Wmissing-field-initializers] transport/shm.cc:170:96: warning: unused parameter 'nranks' [-Wunused-parameter] 170 | static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { | ~~~~^~~~~~ transport/shm.cc:170:108: warning: unused parameter 'rank' [-Wunused-parameter] 170 | static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { | ~~~~^~~~ transport/shm.cc: In function 'ncclResult_t shmSendProxyConnect(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/shm.cc:219:179: warning: unused parameter 'done' [-Wunused-parameter] 219 | static ncclResult_t shmSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~^~~~ transport/shm.cc: In function 'ncclResult_t shmRecvProxyConnect(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/shm.cc:237:179: warning: unused parameter 'done' [-Wunused-parameter] 237 | static ncclResult_t shmRecvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~^~~~ transport/shm.cc: In function 'ncclResult_t shmSendProxyFree(ncclProxyConnection*, ncclProxyState*)': transport/shm.cc:255:101: warning: unused parameter 'proxyState' [-Wunused-parameter] 255 | static ncclResult_t shmSendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~ transport/shm.cc: In function 'ncclResult_t shmRecvProxyFree(ncclProxyConnection*, ncclProxyState*)': transport/shm.cc:270:101: warning: unused parameter 'proxyState' [-Wunused-parameter] 270 | static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~ In file included from include/core.h:59: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = shmSendResources; size_t = long unsigned int]': transport/shm.cc:78:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = shmRecvResources; size_t = long unsigned int]': transport/shm.cc:101:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = shmProxyInfo; size_t = long unsigned int]': transport/shm.cc:221:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling transport/p2p.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/p2p.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/p2p.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport/p2p.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/p2p.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from transport/p2p.cc:7: include/devcomm.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/devcomm.h:350:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 350 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/devcomm.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/devcomm.h:351:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 351 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:62, from include/info.h:13, from include/graph.h:114, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ transport/p2p.cc: In function 'ncclResult_t p2pCanConnect(int*, ncclTopoSystem*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*)': transport/p2p.cc:103:89: warning: unused parameter 'graph' [-Wunused-parameter] 103 | ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/p2p.cc: In function 'ncclResult_t p2pRecvSetup(ncclComm*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*, ncclConnect*, ncclConnector*, int, int)': transport/p2p.cc:395:71: warning: unused parameter 'channelId' [-Wunused-parameter] 395 | struct ncclConnect* connectInfo, struct ncclConnector * recv, int channelId, int connIndex) { | ~~~~^~~~~~~~~ transport/p2p.cc: In function 'ncclResult_t p2pSendConnect(ncclComm*, ncclConnect*, int, int, ncclConnector*)': transport/p2p.cc:445:96: warning: unused parameter 'nranks' [-Wunused-parameter] 445 | static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { | ~~~~^~~~~~ transport/p2p.cc: In function 'ncclResult_t p2pRecvConnect(ncclComm*, ncclConnect*, int, int, ncclConnector*)': transport/p2p.cc:481:89: warning: unused parameter 'nranks' [-Wunused-parameter] 481 | ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { | ~~~~^~~~~~ transport/p2p.cc: In function 'ncclResult_t p2pRecvProxySetup(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/p2p.cc:600:102: warning: unused parameter 'proxyState' [-Wunused-parameter] 600 | static ncclResult_t p2pRecvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~ transport/p2p.cc: In function 'ncclResult_t p2pSendProxyConnect(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/p2p.cc:620:104: warning: unused parameter 'proxyState' [-Wunused-parameter] 620 | static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~ transport/p2p.cc:620:150: warning: unused parameter 'respBuff' [-Wunused-parameter] 620 | static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~~^~~~~~~~ transport/p2p.cc:620:164: warning: unused parameter 'respSize' [-Wunused-parameter] 620 | static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~^~~~~~~~ transport/p2p.cc:620:179: warning: unused parameter 'done' [-Wunused-parameter] 620 | static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~^~~~ transport/p2p.cc: In function 'ncclResult_t p2pSendProxyFree(ncclProxyConnection*, ncclProxyState*)': transport/p2p.cc:634:101: warning: unused parameter 'proxyState' [-Wunused-parameter] 634 | static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~ transport/p2p.cc: In function 'ncclResult_t p2pRecvProxyFree(ncclProxyConnection*, ncclProxyState*)': transport/p2p.cc:666:101: warning: unused parameter 'proxyState' [-Wunused-parameter] 666 | static ncclResult_t p2pRecvProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~ In file included from include/core.h:59: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = p2pResources; size_t = long unsigned int]': transport/p2p.cc:332:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = p2pShmProxyInfo; size_t = long unsigned int]': transport/p2p.cc:562:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = p2pCuMemProxyInfo; size_t = long unsigned int]': transport/p2p.cc:589:7: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling graph/search.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/search.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/search.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c graph/search.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/search.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/core.h:62, from graph/search.cc:7: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ In file included from include/graph.h:11, from graph/search.cc:8: include/devcomm.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/devcomm.h:350:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 350 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/devcomm.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/devcomm.h:351:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 351 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ graph/search.cc: In function 'float getTotalBw(ncclTopoSystem*, ncclTopoNode*)': graph/search.cc:27:48: warning: unused parameter 'system' [-Wunused-parameter] 27 | static float getTotalBw(struct ncclTopoSystem* system, struct ncclTopoNode* gpu) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~ In file included from include/core.h:59: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = int; size_t = long unsigned int]': graph/search.cc:377:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclXml; size_t = long unsigned int]': graph/search.cc:819:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling misc/gdrwrap.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/gdrwrap.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/gdrwrap.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c misc/gdrwrap.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/gdrwrap.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/core.h:62, from misc/gdrwrap.cc:10: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling transport/coll_net.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/coll_net.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/coll_net.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport/coll_net.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/coll_net.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from transport/coll_net.cc:7: include/devcomm.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/devcomm.h:350:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 350 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/devcomm.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/devcomm.h:351:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 351 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:62, from include/info.h:13, from include/graph.h:114, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ transport/coll_net.cc: In function 'ncclResult_t canConnect(int*, ncclTopoSystem*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*)': transport/coll_net.cc:134:65: warning: unused parameter 'topo' [-Wunused-parameter] 134 | static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~~~^~~~ transport/coll_net.cc:134:93: warning: unused parameter 'graph' [-Wunused-parameter] 134 | static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/coll_net.cc:134:121: warning: unused parameter 'info1' [-Wunused-parameter] 134 | static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/coll_net.cc:134:149: warning: unused parameter 'info2' [-Wunused-parameter] 134 | static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/coll_net.cc: In function 'ncclResult_t sendSetup(ncclComm*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*, ncclConnect*, ncclConnector*, int, int)': transport/coll_net.cc:151:29: warning: missing initializer for member 'setupReq::useGdr' [-Wmissing-field-initializers] 151 | struct setupReq req = { 0 }; | ^ transport/coll_net.cc:151:29: warning: missing initializer for member 'setupReq::needFlush' [-Wmissing-field-initializers] transport/coll_net.cc:151:29: warning: missing initializer for member 'setupReq::collNet' [-Wmissing-field-initializers] transport/coll_net.cc:150:133: warning: unused parameter 'peerInfo' [-Wunused-parameter] 150 | static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { | ~~~~~~~~~~~~~~~~~~~~~^~~~~~~~ transport/coll_net.cc:150:163: warning: unused parameter 'connectInfo' [-Wunused-parameter] 150 | static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { | ~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~ transport/coll_net.cc: In function 'ncclResult_t recvSetup(ncclComm*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*, ncclConnect*, ncclConnector*, int, int)': transport/coll_net.cc:171:29: warning: missing initializer for member 'setupReq::useGdr' [-Wmissing-field-initializers] 171 | struct setupReq req = { 0 }; | ^ transport/coll_net.cc:171:29: warning: missing initializer for member 'setupReq::needFlush' [-Wmissing-field-initializers] transport/coll_net.cc:171:29: warning: missing initializer for member 'setupReq::collNet' [-Wmissing-field-initializers] transport/coll_net.cc:170:133: warning: unused parameter 'peerInfo' [-Wunused-parameter] 170 | static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { | ~~~~~~~~~~~~~~~~~~~~~^~~~~~~~ transport/coll_net.cc: In function 'ncclResult_t sendFree(ncclConnector*)': transport/coll_net.cc:278:52: warning: unused parameter 'send' [-Wunused-parameter] 278 | static ncclResult_t sendFree(struct ncclConnector* send) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~ transport/coll_net.cc: In function 'ncclResult_t recvFree(ncclConnector*)': transport/coll_net.cc:282:52: warning: unused parameter 'recv' [-Wunused-parameter] 282 | static ncclResult_t recvFree(struct ncclConnector* recv) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~ transport/coll_net.cc: In function 'ncclResult_t sendProxySetup(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/coll_net.cc:286:145: warning: unused parameter 'respBuff' [-Wunused-parameter] 286 | static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~~^~~~~~~~ transport/coll_net.cc:286:159: warning: unused parameter 'respSize' [-Wunused-parameter] 286 | static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~^~~~~~~~ transport/coll_net.cc:286:174: warning: unused parameter 'done' [-Wunused-parameter] 286 | static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~^~~~ transport/coll_net.cc: In function 'ncclResult_t recvProxySetup(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/coll_net.cc:394:174: warning: unused parameter 'done' [-Wunused-parameter] 394 | static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~^~~~ transport/coll_net.cc: In function 'ncclResult_t sendProxyConnect(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/coll_net.cc:419:176: warning: unused parameter 'done' [-Wunused-parameter] 419 | static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~^~~~ transport/coll_net.cc: In function 'ncclResult_t recvProxyConnect(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/coll_net.cc:493:176: warning: unused parameter 'done' [-Wunused-parameter] 493 | static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~^~~~ In file included from include/core.h:59: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = sendResources; size_t = long unsigned int]': transport/coll_net.cc:291:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = sharedResources; size_t = long unsigned int]': transport/coll_net.cc:314:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = char (*)[128]; size_t = long unsigned int]': transport/coll_net.cc:327:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = recvResources; size_t = long unsigned int]': transport/coll_net.cc:399:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = gdr_mem_desc; size_t = long unsigned int]': include/gdrwrap.h:218:3: required from 'ncclResult_t ncclGdrCudaCalloc(T**, T**, size_t, void**) [with T = long unsigned int; size_t = long unsigned int]' transport/coll_net.cc:452:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling transport/net.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/net.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/net.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c transport/net.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/net.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from transport/net.cc:7: include/devcomm.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/devcomm.h:350:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 350 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/devcomm.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/devcomm.h:351:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 351 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:62, from include/info.h:13, from include/graph.h:114, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ transport/net.cc: In function 'ncclResult_t canConnect(int*, ncclTopoSystem*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*)': transport/net.cc:138:93: warning: unused parameter 'graph' [-Wunused-parameter] 138 | static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { | ~~~~~~~~~~~~~~~~~~~~~~^~~~~ transport/net.cc: In function 'ncclResult_t sendSetup(ncclComm*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*, ncclConnect*, ncclConnector*, int, int)': transport/net.cc:165:29: warning: missing initializer for member 'setupReq::tpLocalRank' [-Wmissing-field-initializers] 165 | struct setupReq req = { 0 }; | ^ transport/net.cc:165:29: warning: missing initializer for member 'setupReq::tpRemoteRank' [-Wmissing-field-initializers] transport/net.cc:165:29: warning: missing initializer for member 'setupReq::shared' [-Wmissing-field-initializers] transport/net.cc:165:29: warning: missing initializer for member 'setupReq::netDev' [-Wmissing-field-initializers] transport/net.cc:165:29: warning: missing initializer for member 'setupReq::useGdr' [-Wmissing-field-initializers] transport/net.cc:165:29: warning: missing initializer for member 'setupReq::needFlush' [-Wmissing-field-initializers] transport/net.cc:165:29: warning: missing initializer for member 'setupReq::channelId' [-Wmissing-field-initializers] transport/net.cc:165:29: warning: missing initializer for member 'setupReq::connIndex' [-Wmissing-field-initializers] transport/net.cc: In function 'ncclResult_t recvSetup(ncclComm*, ncclTopoGraph*, ncclPeerInfo*, ncclPeerInfo*, ncclConnect*, ncclConnector*, int, int)': transport/net.cc:203:29: warning: missing initializer for member 'setupReq::tpLocalRank' [-Wmissing-field-initializers] 203 | struct setupReq req = { 0 }; | ^ transport/net.cc:203:29: warning: missing initializer for member 'setupReq::tpRemoteRank' [-Wmissing-field-initializers] transport/net.cc:203:29: warning: missing initializer for member 'setupReq::shared' [-Wmissing-field-initializers] transport/net.cc:203:29: warning: missing initializer for member 'setupReq::netDev' [-Wmissing-field-initializers] transport/net.cc:203:29: warning: missing initializer for member 'setupReq::useGdr' [-Wmissing-field-initializers] transport/net.cc:203:29: warning: missing initializer for member 'setupReq::needFlush' [-Wmissing-field-initializers] transport/net.cc:203:29: warning: missing initializer for member 'setupReq::channelId' [-Wmissing-field-initializers] transport/net.cc:203:29: warning: missing initializer for member 'setupReq::connIndex' [-Wmissing-field-initializers] transport/net.cc: In function 'ncclResult_t sendConnect(ncclComm*, ncclConnect*, int, int, ncclConnector*)': transport/net.cc:270:93: warning: unused parameter 'nranks' [-Wunused-parameter] 270 | static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { | ~~~~^~~~~~ transport/net.cc:270:105: warning: unused parameter 'rank' [-Wunused-parameter] 270 | static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { | ~~~~^~~~ transport/net.cc: In function 'ncclResult_t recvConnect(ncclComm*, ncclConnect*, int, int, ncclConnector*)': transport/net.cc:346:93: warning: unused parameter 'nranks' [-Wunused-parameter] 346 | static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { | ~~~~^~~~~~ transport/net.cc:346:105: warning: unused parameter 'rank' [-Wunused-parameter] 346 | static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { | ~~~~^~~~ transport/net.cc: In function 'ncclResult_t sendProxySetup(ncclProxyConnection*, ncclProxyState*, void*, int, void*, int, int*)': transport/net.cc:499:145: warning: unused parameter 'respBuff' [-Wunused-parameter] 499 | static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { | ~~~~~~^~~~~~~~ In file included from include/core.h:59: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = connectMap; size_t = long unsigned int]': transport/net.cc:278:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclProxyPeer*; size_t = long unsigned int]': transport/net.cc:427:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclProxyPeer; size_t = long unsigned int]': transport/net.cc:431:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = sendResources; size_t = long unsigned int]': transport/net.cc:504:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = recvResources; size_t = long unsigned int]': transport/net.cc:532:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclSharedNetComms; size_t = long unsigned int]': transport/net.cc:577:9: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = gdr_mem_desc; size_t = long unsigned int]': include/gdrwrap.h:218:3: required from 'ncclResult_t ncclGdrCudaCalloc(T**, T**, size_t, void**) [with T = long unsigned int; size_t = long unsigned int]' transport/net.cc:650:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling enqueue.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/enqueue.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/enqueue.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c enqueue.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/enqueue.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from include/enqueue.h:10, from enqueue.cc:7: include/devcomm.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/devcomm.h:350:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 350 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/devcomm.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/devcomm.h:351:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 351 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:62, from include/info.h:13, from include/graph.h:114, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ enqueue.cc: In function 'ncclResult_t ncclInitKernelsForDevice(int, size_t*)': enqueue.cc:114:35: warning: missing initializer for member 'cudaFuncAttributes::constSizeBytes' [-Wmissing-field-initializers] 114 | cudaFuncAttributes attr = {0}; | ^ enqueue.cc:114:35: warning: missing initializer for member 'cudaFuncAttributes::localSizeBytes' [-Wmissing-field-initializers] enqueue.cc:114:35: warning: missing initializer for member 'cudaFuncAttributes::maxThreadsPerBlock' [-Wmissing-field-initializers] enqueue.cc:114:35: warning: missing initializer for member 'cudaFuncAttributes::numRegs' [-Wmissing-field-initializers] enqueue.cc:114:35: warning: missing initializer for member 'cudaFuncAttributes::ptxVersion' [-Wmissing-field-initializers] enqueue.cc:114:35: warning: missing initializer for member 'cudaFuncAttributes::binaryVersion' [-Wmissing-field-initializers] enqueue.cc:114:35: warning: missing initializer for member 'cudaFuncAttributes::cacheModeCA' [-Wmissing-field-initializers] enqueue.cc:114:35: warning: missing initializer for member 'cudaFuncAttributes::maxDynamicSharedSizeBytes' [-Wmissing-field-initializers] enqueue.cc:114:35: warning: missing initializer for member 'cudaFuncAttributes::preferredShmemCarveout' [-Wmissing-field-initializers] enqueue.cc:114:35: warning: missing initializer for member 'cudaFuncAttributes::clusterDimMustBeSet' [-Wmissing-field-initializers] enqueue.cc:114:35: warning: missing initializer for member 'cudaFuncAttributes::requiredClusterWidth' [-Wmissing-field-initializers] enqueue.cc:114:35: warning: missing initializer for member 'cudaFuncAttributes::requiredClusterHeight' [-Wmissing-field-initializers] enqueue.cc:114:35: warning: missing initializer for member 'cudaFuncAttributes::requiredClusterDepth' [-Wmissing-field-initializers] enqueue.cc:114:35: warning: missing initializer for member 'cudaFuncAttributes::clusterSchedulingPolicyPreference' [-Wmissing-field-initializers] enqueue.cc:114:35: warning: missing initializer for member 'cudaFuncAttributes::nonPortableClusterSizeAllowed' [-Wmissing-field-initializers] enqueue.cc:114:35: warning: missing initializer for member 'cudaFuncAttributes::reserved' [-Wmissing-field-initializers] enqueue.cc: In function 'ncclResult_t addP2pToPlan(ncclComm*, ncclKernelPlan*, int*, bool, int, int, void*, size_t, bool)': enqueue.cc:361:3: warning: missing initializer for member 'ncclInfo::opFull' [-Wmissing-field-initializers] 361 | }; | ^ enqueue.cc:361:3: warning: missing initializer for member 'ncclInfo::algorithm' [-Wmissing-field-initializers] enqueue.cc:361:3: warning: missing initializer for member 'ncclInfo::protocol' [-Wmissing-field-initializers] enqueue.cc:361:3: warning: missing initializer for member 'ncclInfo::pattern' [-Wmissing-field-initializers] enqueue.cc:361:3: warning: missing initializer for member 'ncclInfo::nChannels' [-Wmissing-field-initializers] enqueue.cc:361:3: warning: missing initializer for member 'ncclInfo::nThreads' [-Wmissing-field-initializers] enqueue.cc:361:3: warning: missing initializer for member 'ncclInfo::nBytes' [-Wmissing-field-initializers] enqueue.cc:361:3: warning: missing initializer for member 'ncclInfo::nstepsPerLoop' [-Wmissing-field-initializers] enqueue.cc:361:3: warning: missing initializer for member 'ncclInfo::nchunksPerLoop' [-Wmissing-field-initializers] enqueue.cc:361:3: warning: missing initializer for member 'ncclInfo::chunkSize' [-Wmissing-field-initializers] enqueue.cc:361:3: warning: missing initializer for member 'ncclInfo::channelId' [-Wmissing-field-initializers] enqueue.cc:375:35: warning: missing initializer for member 'ncclWorkElemP2p::proto' [-Wmissing-field-initializers] 375 | struct ncclWorkElemP2p elem = {0}; | ^ enqueue.cc:375:35: warning: missing initializer for member 'ncclWorkElemP2p::p2pType' [-Wmissing-field-initializers] enqueue.cc:375:35: warning: missing initializer for member 'ncclWorkElemP2p::nWarps' [-Wmissing-field-initializers] enqueue.cc:375:35: warning: missing initializer for member 'ncclWorkElemP2p::warpStart' [-Wmissing-field-initializers] enqueue.cc:375:35: warning: missing initializer for member 'ncclWorkElemP2p::ngroups' [-Wmissing-field-initializers] enqueue.cc:375:35: warning: missing initializer for member 'ncclWorkElemP2p::buffHi32' [-Wmissing-field-initializers] enqueue.cc:375:35: warning: missing initializer for member 'ncclWorkElemP2p::buffLo32' [-Wmissing-field-initializers] enqueue.cc:375:35: warning: missing initializer for member 'ncclWorkElemP2p::countHi32' [-Wmissing-field-initializers] enqueue.cc:375:35: warning: missing initializer for member 'ncclWorkElemP2p::countLo32' [-Wmissing-field-initializers] enqueue.cc:375:35: warning: missing initializer for member 'ncclWorkElemP2p::chunkSize' [-Wmissing-field-initializers] enqueue.cc: In function 'ncclResult_t ncclLaunchKernel(ncclComm*, ncclKernelPlan*)': enqueue.cc:1052:41: warning: missing initializer for member 'cudaLaunchConfig_st::blockDim' [-Wmissing-field-initializers] 1052 | cudaLaunchConfig_t launchConfig = {0}; | ^ enqueue.cc:1052:41: warning: missing initializer for member 'cudaLaunchConfig_st::dynamicSmemBytes' [-Wmissing-field-initializers] enqueue.cc:1052:41: warning: missing initializer for member 'cudaLaunchConfig_st::stream' [-Wmissing-field-initializers] enqueue.cc:1052:41: warning: missing initializer for member 'cudaLaunchConfig_st::attrs' [-Wmissing-field-initializers] enqueue.cc:1052:41: warning: missing initializer for member 'cudaLaunchConfig_st::numAttrs' [-Wmissing-field-initializers] enqueue.cc: In function 'ncclResult_t ncclLaunchPrepare(ncclComm*)': enqueue.cc:686:35: warning: 'fuseOk' may be used uninitialized [-Wmaybe-uninitialized] 686 | NCCLCHECK(addP2pToPlan(comm, plan, nWorkBudget, /*isSendNotRecv=*/false, recvPeer, recv->chunk, recvPtr, recvChunkBytes, fuseOk)); | ^ enqueue.cc:639:8: note: 'fuseOk' was declared here 639 | bool fuseOk; | ^~~~~~ make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Compiling init.cc > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/init.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/init.o` g++ -I. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -Iinclude -c init.cc -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/init.o make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' In file included from include/transport.h:10, from include/comm.h:10, from include/channel.h:9, from init.cc:8: include/devcomm.h: In function 'constexpr int ncclNvlsUnrollBytes(int)': include/devcomm.h:350:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 350 | __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } | ^ include/devcomm.h: In function 'constexpr int ncclNvlsUnrollInsns(int)': include/devcomm.h:351:59: warning: unused parameter 'cudaArch' [-Wunused-parameter] 351 | __host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } | ^ In file included from include/core.h:62, from include/info.h:13, from include/graph.h:114, from include/transport.h:11: include/nvtx.h: At global scope: include/nvtx.h:66:21: warning: missing initializer for member 'nvtxPayloadSchemaAttr_t::schemaId' [-Wmissing-field-initializers] 66 | nullptr, 0, 0, 0}; | ^ init.cc: In function 'ncclResult_t commGetSplitInfo(ncclComm*, ncclComm*, int, int, int*, int*, int*)': init.cc:1271:55: warning: unused parameter 'comm' [-Wunused-parameter] 1271 | static ncclResult_t commGetSplitInfo(struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* nRanksRet, int* myRankRet, int* parentRanksRet) { | ~~~~~~~~~~~~~~~~~^~~~ init.cc: At global scope: init.cc:1622:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::description' [-Wmissing-field-initializers] 1622 | }; | ^ init.cc:1622:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::arrayOrUnionDetail' [-Wmissing-field-initializers] init.cc:1622:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::offset' [-Wmissing-field-initializers] init.cc:1622:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] init.cc:1622:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] init.cc:1622:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] init.cc:1622:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] init.cc:1622:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] init.cc:1622:1: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] init.cc: In function 'ncclResult_t ncclCommInitAll(ncclComm**, int, const int*)': init.cc:1649:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::description' [-Wmissing-field-initializers] 1649 | }; | ^ init.cc:1649:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::arrayOrUnionDetail' [-Wmissing-field-initializers] init.cc:1649:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::offset' [-Wmissing-field-initializers] init.cc:1649:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::semantics' [-Wmissing-field-initializers] init.cc:1649:3: warning: missing initializer for member 'nvtxPayloadSchemaEntry_t::reserved' [-Wmissing-field-initializers] init.cc: In function 'const char* ncclGetLastError(ncclComm_t)': init.cc:2067:41: warning: unused parameter 'comm' [-Wunused-parameter] 2067 | const char* ncclGetLastError(ncclComm_t comm) { | ~~~~~~~~~~~^~~~ In file included from include/core.h:59: include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = long unsigned int; size_t = long unsigned int]': init.cc:348:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclSharedResources; size_t = long unsigned int]': init.cc:356:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = int; size_t = long unsigned int]': init.cc:360:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = collNetTrySetup(ncclComm_t, ncclComm_t, ncclTopoGraph*)::collnetShareInfo; size_t = long unsigned int]': init.cc:571:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclCollNetSharedRes; size_t = long unsigned int]': init.cc:627:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = unsigned char [4][10]; size_t = long unsigned int]': init.cc:658:7: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclPeerInfo; size_t = long unsigned int]': init.cc:789:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = initTransportsRank(ncclComm*, ncclComm*)::allGatherInfo; size_t = long unsigned int]': init.cc:923:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclNodeRanks; size_t = long unsigned int]': init.cc:957:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclTopoRanks*; size_t = long unsigned int]': init.cc:991:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclComm; size_t = long unsigned int]': init.cc:1583:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = unsigned int; size_t = long unsigned int]': init.cc:1585:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclCommInitRankAsyncJob; size_t = long unsigned int]': init.cc:1592:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = ncclCommFinalizeAsyncJob; size_t = long unsigned int]': init.cc:1794:3: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ include/alloc.h: In instantiation of 'ncclResult_t ncclCallocDebug(T**, size_t, const char*, int) [with T = gdr_mem_desc; size_t = long unsigned int]': include/gdrwrap.h:218:3: required from 'ncclResult_t ncclGdrCudaCalloc(T**, T**, size_t, void**) [with T = ncclWork; size_t = long unsigned int]' init.cc:407:5: required from here include/alloc.h:44:65: warning: unused parameter 'filefunc' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~~~~~~~~~^~~~~~~~ include/alloc.h:44:79: warning: unused parameter 'line' [-Wunused-parameter] 44 | ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { | ~~~~^~~~ make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z44ncclKernel_Broadcast_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z44ncclKernel_Broadcast_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_Broadcast_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclKernel_Broadcast_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_Broadcast_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z48ncclKernel_Broadcast_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_Broadcast_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z49ncclKernel_Broadcast_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_Broadcast_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclKernel_Broadcast_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclKernel_Broadcast_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclKernel_Broadcast_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z50ncclFunction_Broadcast_NVLS_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Broadcast_NVLS_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Broadcast_NVLS_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Broadcast_NVLS_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Broadcast_NVLS_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Broadcast_NVLS_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Broadcast_COLLNET_CHAIN_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Broadcast_COLLNET_CHAIN_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Broadcast_COLLNET_CHAIN_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Broadcast_COLLNET_DIRECT_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Broadcast_COLLNET_DIRECT_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Broadcast_COLLNET_DIRECT_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Broadcast_RING_SIMPLE_Sum_int8_tv 264 bytes stack frame, 280 bytes spill stores, 284 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Broadcast_RING_LL128_Sum_int8_tv 352 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Broadcast_RING_LL_Sum_int8_tv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Broadcast_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Broadcast_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Broadcast_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z44ncclKernel_Broadcast_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z44ncclKernel_Broadcast_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_Broadcast_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclKernel_Broadcast_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_Broadcast_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z48ncclKernel_Broadcast_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_Broadcast_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z49ncclKernel_Broadcast_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_Broadcast_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclKernel_Broadcast_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclKernel_Broadcast_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclKernel_Broadcast_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z50ncclFunction_Broadcast_NVLS_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Broadcast_NVLS_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Broadcast_NVLS_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Broadcast_NVLS_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Broadcast_NVLS_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Broadcast_NVLS_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Broadcast_COLLNET_CHAIN_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Broadcast_COLLNET_CHAIN_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Broadcast_COLLNET_CHAIN_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Broadcast_COLLNET_DIRECT_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Broadcast_COLLNET_DIRECT_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Broadcast_COLLNET_DIRECT_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Broadcast_RING_SIMPLE_Sum_int8_tv 264 bytes stack frame, 280 bytes spill stores, 284 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Broadcast_RING_LL128_Sum_int8_tv 352 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Broadcast_RING_LL_Sum_int8_tv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Broadcast_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Broadcast_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Broadcast_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z44ncclKernel_Broadcast_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z44ncclKernel_Broadcast_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_Broadcast_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclKernel_Broadcast_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_Broadcast_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z48ncclKernel_Broadcast_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_Broadcast_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z49ncclKernel_Broadcast_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_Broadcast_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclKernel_Broadcast_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclKernel_Broadcast_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclKernel_Broadcast_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z50ncclFunction_Broadcast_NVLS_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Broadcast_NVLS_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Broadcast_NVLS_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Broadcast_NVLS_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Broadcast_NVLS_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Broadcast_NVLS_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Broadcast_COLLNET_CHAIN_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Broadcast_COLLNET_CHAIN_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Broadcast_COLLNET_CHAIN_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Broadcast_COLLNET_DIRECT_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Broadcast_COLLNET_DIRECT_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Broadcast_COLLNET_DIRECT_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Broadcast_RING_SIMPLE_Sum_int8_tv 264 bytes stack frame, 280 bytes spill stores, 284 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Broadcast_RING_LL128_Sum_int8_tv 352 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Broadcast_RING_LL_Sum_int8_tv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Broadcast_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Broadcast_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Broadcast_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z44ncclKernel_Broadcast_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z44ncclKernel_Broadcast_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_Broadcast_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclKernel_Broadcast_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_Broadcast_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z48ncclKernel_Broadcast_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_Broadcast_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z49ncclKernel_Broadcast_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_Broadcast_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclKernel_Broadcast_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 86 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_Broadcast_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclKernel_Broadcast_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z50ncclFunction_Broadcast_NVLS_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Broadcast_NVLS_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Broadcast_NVLS_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Broadcast_NVLS_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Broadcast_NVLS_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Broadcast_NVLS_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Broadcast_COLLNET_CHAIN_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Broadcast_COLLNET_CHAIN_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Broadcast_COLLNET_CHAIN_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Broadcast_COLLNET_DIRECT_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Broadcast_COLLNET_DIRECT_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Broadcast_COLLNET_DIRECT_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Broadcast_RING_SIMPLE_Sum_int8_tv 280 bytes stack frame, 296 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Broadcast_RING_LL128_Sum_int8_tv 416 bytes stack frame, 236 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Broadcast_RING_LL_Sum_int8_tv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Broadcast_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Broadcast_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Broadcast_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z44ncclKernel_Broadcast_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z44ncclKernel_Broadcast_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_Broadcast_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclKernel_Broadcast_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_Broadcast_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z48ncclKernel_Broadcast_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_Broadcast_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z49ncclKernel_Broadcast_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_Broadcast_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclKernel_Broadcast_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 84 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_Broadcast_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclKernel_Broadcast_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z50ncclFunction_Broadcast_NVLS_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Broadcast_NVLS_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Broadcast_NVLS_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Broadcast_NVLS_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Broadcast_NVLS_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Broadcast_NVLS_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Broadcast_COLLNET_CHAIN_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Broadcast_COLLNET_CHAIN_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Broadcast_COLLNET_CHAIN_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Broadcast_COLLNET_DIRECT_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Broadcast_COLLNET_DIRECT_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Broadcast_COLLNET_DIRECT_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Broadcast_RING_SIMPLE_Sum_int8_tv 416 bytes stack frame, 492 bytes spill stores, 972 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Broadcast_RING_LL128_Sum_int8_tv 408 bytes stack frame, 236 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Broadcast_RING_LL_Sum_int8_tv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Broadcast_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Broadcast_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Broadcast_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z44ncclKernel_Broadcast_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z44ncclKernel_Broadcast_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z39ncclKernel_Broadcast_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclKernel_Broadcast_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z48ncclKernel_Broadcast_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z48ncclKernel_Broadcast_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z49ncclKernel_Broadcast_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z49ncclKernel_Broadcast_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z39ncclKernel_Broadcast_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclKernel_Broadcast_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers ptxas info : Compiling entry function '_Z39ncclKernel_Broadcast_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclKernel_Broadcast_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Function properties for _Z50ncclFunction_Broadcast_NVLS_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Broadcast_NVLS_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Broadcast_NVLS_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Broadcast_NVLS_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Broadcast_NVLS_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Broadcast_NVLS_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Broadcast_COLLNET_CHAIN_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Broadcast_COLLNET_CHAIN_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Broadcast_COLLNET_CHAIN_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Broadcast_COLLNET_DIRECT_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Broadcast_COLLNET_DIRECT_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Broadcast_COLLNET_DIRECT_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Broadcast_RING_SIMPLE_Sum_int8_tv 328 bytes stack frame, 380 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Broadcast_RING_LL128_Sum_int8_tv 392 bytes stack frame, 216 bytes spill stores, 216 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Broadcast_RING_LL_Sum_int8_tv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Broadcast_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Broadcast_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Broadcast_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling broadcast.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_gather.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z44ncclKernel_AllGather_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z44ncclKernel_AllGather_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllGather_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclKernel_AllGather_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_AllGather_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z48ncclKernel_AllGather_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_AllGather_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z49ncclKernel_AllGather_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllGather_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclKernel_AllGather_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 91 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclKernel_AllGather_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclKernel_AllGather_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z50ncclFunction_AllGather_NVLS_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllGather_NVLS_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllGather_NVLS_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllGather_NVLS_SIMPLE_Sum_int8_tv 272 bytes stack frame, 288 bytes spill stores, 284 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllGather_NVLS_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllGather_NVLS_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllGather_COLLNET_CHAIN_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllGather_COLLNET_CHAIN_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllGather_COLLNET_CHAIN_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllGather_COLLNET_DIRECT_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllGather_COLLNET_DIRECT_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllGather_COLLNET_DIRECT_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllGather_RING_SIMPLE_Sum_int8_tv 264 bytes stack frame, 288 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllGather_RING_LL128_Sum_int8_tv 360 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllGather_RING_LL_Sum_int8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllGather_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllGather_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllGather_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z44ncclKernel_AllGather_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z44ncclKernel_AllGather_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllGather_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclKernel_AllGather_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_AllGather_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z48ncclKernel_AllGather_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_AllGather_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z49ncclKernel_AllGather_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllGather_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclKernel_AllGather_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 88 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclKernel_AllGather_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclKernel_AllGather_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z50ncclFunction_AllGather_NVLS_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllGather_NVLS_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllGather_NVLS_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllGather_NVLS_SIMPLE_Sum_int8_tv 272 bytes stack frame, 288 bytes spill stores, 284 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllGather_NVLS_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllGather_NVLS_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllGather_COLLNET_CHAIN_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllGather_COLLNET_CHAIN_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllGather_COLLNET_CHAIN_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllGather_COLLNET_DIRECT_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllGather_COLLNET_DIRECT_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllGather_COLLNET_DIRECT_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllGather_RING_SIMPLE_Sum_int8_tv 264 bytes stack frame, 288 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllGather_RING_LL128_Sum_int8_tv 360 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllGather_RING_LL_Sum_int8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllGather_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllGather_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllGather_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z44ncclKernel_AllGather_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z44ncclKernel_AllGather_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllGather_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclKernel_AllGather_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_AllGather_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z48ncclKernel_AllGather_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_AllGather_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z49ncclKernel_AllGather_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllGather_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclKernel_AllGather_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 91 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclKernel_AllGather_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclKernel_AllGather_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z50ncclFunction_AllGather_NVLS_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllGather_NVLS_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllGather_NVLS_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllGather_NVLS_SIMPLE_Sum_int8_tv 272 bytes stack frame, 288 bytes spill stores, 284 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllGather_NVLS_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllGather_NVLS_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllGather_COLLNET_CHAIN_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllGather_COLLNET_CHAIN_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllGather_COLLNET_CHAIN_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllGather_COLLNET_DIRECT_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllGather_COLLNET_DIRECT_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllGather_COLLNET_DIRECT_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllGather_RING_SIMPLE_Sum_int8_tv 264 bytes stack frame, 288 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllGather_RING_LL128_Sum_int8_tv 360 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllGather_RING_LL_Sum_int8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllGather_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllGather_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllGather_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z44ncclKernel_AllGather_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z44ncclKernel_AllGather_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllGather_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclKernel_AllGather_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_AllGather_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z48ncclKernel_AllGather_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_AllGather_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z49ncclKernel_AllGather_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllGather_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclKernel_AllGather_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 92 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllGather_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclKernel_AllGather_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z50ncclFunction_AllGather_NVLS_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllGather_NVLS_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllGather_NVLS_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllGather_NVLS_SIMPLE_Sum_int8_tv 288 bytes stack frame, 304 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllGather_NVLS_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllGather_NVLS_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllGather_COLLNET_CHAIN_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllGather_COLLNET_CHAIN_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllGather_COLLNET_CHAIN_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllGather_COLLNET_DIRECT_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllGather_COLLNET_DIRECT_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllGather_COLLNET_DIRECT_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllGather_RING_SIMPLE_Sum_int8_tv 312 bytes stack frame, 344 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllGather_RING_LL128_Sum_int8_tv 424 bytes stack frame, 252 bytes spill stores, 252 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllGather_RING_LL_Sum_int8_tv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllGather_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllGather_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllGather_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z44ncclKernel_AllGather_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z44ncclKernel_AllGather_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllGather_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclKernel_AllGather_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_AllGather_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z48ncclKernel_AllGather_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_AllGather_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z49ncclKernel_AllGather_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllGather_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclKernel_AllGather_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 90 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllGather_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclKernel_AllGather_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z50ncclFunction_AllGather_NVLS_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllGather_NVLS_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllGather_NVLS_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllGather_NVLS_SIMPLE_Sum_int8_tv 440 bytes stack frame, 536 bytes spill stores, 824 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllGather_NVLS_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllGather_NVLS_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllGather_COLLNET_CHAIN_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllGather_COLLNET_CHAIN_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllGather_COLLNET_CHAIN_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllGather_COLLNET_DIRECT_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllGather_COLLNET_DIRECT_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllGather_COLLNET_DIRECT_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllGather_RING_SIMPLE_Sum_int8_tv 432 bytes stack frame, 604 bytes spill stores, 1332 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllGather_RING_LL128_Sum_int8_tv 424 bytes stack frame, 252 bytes spill stores, 252 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllGather_RING_LL_Sum_int8_tv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllGather_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllGather_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllGather_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z44ncclKernel_AllGather_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z44ncclKernel_AllGather_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z39ncclKernel_AllGather_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclKernel_AllGather_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z48ncclKernel_AllGather_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z48ncclKernel_AllGather_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z49ncclKernel_AllGather_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z49ncclKernel_AllGather_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z39ncclKernel_AllGather_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclKernel_AllGather_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers ptxas info : Compiling entry function '_Z39ncclKernel_AllGather_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclKernel_AllGather_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Function properties for _Z50ncclFunction_AllGather_NVLS_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllGather_NVLS_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllGather_NVLS_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllGather_NVLS_SIMPLE_Sum_int8_tv 424 bytes stack frame, 568 bytes spill stores, 976 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllGather_NVLS_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllGather_NVLS_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllGather_COLLNET_CHAIN_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllGather_COLLNET_CHAIN_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllGather_COLLNET_CHAIN_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllGather_COLLNET_DIRECT_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllGather_COLLNET_DIRECT_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllGather_COLLNET_DIRECT_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllGather_RING_SIMPLE_Sum_int8_tv 368 bytes stack frame, 512 bytes spill stores, 952 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllGather_RING_LL128_Sum_int8_tv 392 bytes stack frame, 212 bytes spill stores, 212 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllGather_RING_LL_Sum_int8_tv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllGather_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllGather_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllGather_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z43ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z38ncclKernel_Reduce_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z47ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z48ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z38ncclKernel_Reduce_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z38ncclKernel_Reduce_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Sum_uint64_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Sum_uint64_tv 456 bytes stack frame, 332 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Sum_uint64_tv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z43ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z38ncclKernel_Reduce_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z47ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z48ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z38ncclKernel_Reduce_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z38ncclKernel_Reduce_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Sum_uint64_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Sum_uint64_tv 456 bytes stack frame, 320 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Sum_uint64_tv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z43ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z38ncclKernel_Reduce_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z47ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z48ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z38ncclKernel_Reduce_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z38ncclKernel_Reduce_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Sum_uint64_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Sum_uint64_tv 456 bytes stack frame, 320 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Sum_uint64_tv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z43ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z38ncclKernel_Reduce_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z47ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z48ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z38ncclKernel_Reduce_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z38ncclKernel_Reduce_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Sum_uint64_tv 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Sum_uint64_tv 512 bytes stack frame, 408 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Sum_uint64_tv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z43ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z38ncclKernel_Reduce_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z47ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z48ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z38ncclKernel_Reduce_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z38ncclKernel_Reduce_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Sum_uint64_tv 320 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Sum_uint64_tv 504 bytes stack frame, 396 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Sum_uint64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z43ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z38ncclKernel_Reduce_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z47ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z47ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z48ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z48ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z38ncclKernel_Reduce_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z38ncclKernel_Reduce_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Sum_uint64_tv 256 bytes stack frame, 272 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Sum_uint64_tv 488 bytes stack frame, 336 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Sum_uint64_tv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z37ncclKernel_Reduce_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z37ncclKernel_Reduce_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z37ncclKernel_Reduce_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Sum_int64_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Sum_int64_tv 456 bytes stack frame, 332 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Sum_int64_tv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z37ncclKernel_Reduce_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z37ncclKernel_Reduce_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z37ncclKernel_Reduce_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Sum_int64_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Sum_int64_tv 456 bytes stack frame, 320 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Sum_int64_tv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z37ncclKernel_Reduce_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z37ncclKernel_Reduce_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z37ncclKernel_Reduce_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Sum_int64_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Sum_int64_tv 456 bytes stack frame, 320 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Sum_int64_tv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z37ncclKernel_Reduce_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z37ncclKernel_Reduce_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z37ncclKernel_Reduce_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Sum_int64_tv 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Sum_int64_tv 512 bytes stack frame, 408 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Sum_int64_tv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z37ncclKernel_Reduce_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z37ncclKernel_Reduce_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z37ncclKernel_Reduce_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Sum_int64_tv 320 bytes stack frame, 352 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Sum_int64_tv 504 bytes stack frame, 396 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Sum_int64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z37ncclKernel_Reduce_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z37ncclKernel_Reduce_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z37ncclKernel_Reduce_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Sum_int64_tv 256 bytes stack frame, 272 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Sum_int64_tv 488 bytes stack frame, 336 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Sum_int64_tv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z37ncclKernel_Reduce_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z37ncclKernel_Reduce_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z37ncclKernel_Reduce_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Sum_int32_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Sum_int32_tv 456 bytes stack frame, 328 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Sum_int32_tv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z37ncclKernel_Reduce_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z37ncclKernel_Reduce_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z37ncclKernel_Reduce_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Sum_int32_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Sum_int32_tv 448 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Sum_int32_tv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z37ncclKernel_Reduce_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z37ncclKernel_Reduce_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z37ncclKernel_Reduce_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Sum_int32_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Sum_int32_tv 448 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Sum_int32_tv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z37ncclKernel_Reduce_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z37ncclKernel_Reduce_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 82 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z37ncclKernel_Reduce_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Sum_int32_tv 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Sum_int32_tv 512 bytes stack frame, 396 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Sum_int32_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z37ncclKernel_Reduce_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z37ncclKernel_Reduce_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 84 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z37ncclKernel_Reduce_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Sum_int32_tv 304 bytes stack frame, 320 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Sum_int32_tv 504 bytes stack frame, 396 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Sum_int32_tv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z37ncclKernel_Reduce_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z37ncclKernel_Reduce_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z37ncclKernel_Reduce_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Sum_int32_tv 256 bytes stack frame, 272 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Sum_int32_tv 496 bytes stack frame, 356 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Sum_int32_tv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z43ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z38ncclKernel_Reduce_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z47ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z48ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z38ncclKernel_Reduce_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z38ncclKernel_Reduce_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Sum_uint32_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Sum_uint32_tv 456 bytes stack frame, 328 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Sum_uint32_tv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z43ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z38ncclKernel_Reduce_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z47ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z48ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z38ncclKernel_Reduce_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z38ncclKernel_Reduce_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Sum_uint32_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Sum_uint32_tv 448 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Sum_uint32_tv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z43ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z38ncclKernel_Reduce_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z47ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z48ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z38ncclKernel_Reduce_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z38ncclKernel_Reduce_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Sum_uint32_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Sum_uint32_tv 448 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Sum_uint32_tv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z43ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z38ncclKernel_Reduce_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z47ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z48ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z38ncclKernel_Reduce_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 82 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z38ncclKernel_Reduce_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Sum_uint32_tv 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Sum_uint32_tv 512 bytes stack frame, 396 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Sum_uint32_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z43ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z38ncclKernel_Reduce_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z47ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z48ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z38ncclKernel_Reduce_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 84 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z38ncclKernel_Reduce_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Sum_uint32_tv 304 bytes stack frame, 320 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Sum_uint32_tv 504 bytes stack frame, 396 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Sum_uint32_tv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z43ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z38ncclKernel_Reduce_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z47ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z47ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z48ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z48ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z38ncclKernel_Reduce_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers ptxas info : Compiling entry function '_Z38ncclKernel_Reduce_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z38ncclKernel_Reduce_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Sum_uint32_tv 256 bytes stack frame, 272 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Sum_uint32_tv 496 bytes stack frame, 356 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Sum_uint32_tv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z41ncclKernel_Reduce_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z41ncclKernel_Reduce_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z36ncclKernel_Reduce_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z45ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z46ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z36ncclKernel_Reduce_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z36ncclKernel_Reduce_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Sum_doublev 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Sum_doublev 456 bytes stack frame, 332 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Sum_doublev 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z41ncclKernel_Reduce_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z41ncclKernel_Reduce_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z36ncclKernel_Reduce_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z45ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z46ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z36ncclKernel_Reduce_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z36ncclKernel_Reduce_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Sum_doublev 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Sum_doublev 456 bytes stack frame, 320 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Sum_doublev 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z41ncclKernel_Reduce_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z41ncclKernel_Reduce_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z36ncclKernel_Reduce_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z45ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z46ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z36ncclKernel_Reduce_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z36ncclKernel_Reduce_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Sum_doublev 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Sum_doublev 456 bytes stack frame, 320 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Sum_doublev 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z41ncclKernel_Reduce_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z41ncclKernel_Reduce_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z36ncclKernel_Reduce_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z45ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z46ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z36ncclKernel_Reduce_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z36ncclKernel_Reduce_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Sum_doublev 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Sum_doublev 512 bytes stack frame, 400 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Sum_doublev 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z41ncclKernel_Reduce_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z41ncclKernel_Reduce_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z36ncclKernel_Reduce_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z45ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z46ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z36ncclKernel_Reduce_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z36ncclKernel_Reduce_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Sum_doublev 264 bytes stack frame, 280 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Sum_doublev 496 bytes stack frame, 360 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Sum_doublev 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z41ncclKernel_Reduce_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z41ncclKernel_Reduce_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z36ncclKernel_Reduce_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z45ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z45ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z46ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z46ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z36ncclKernel_Reduce_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z36ncclKernel_Reduce_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Sum_doublev 216 bytes stack frame, 212 bytes spill stores, 212 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Sum_doublev 472 bytes stack frame, 312 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Sum_doublev 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z40ncclKernel_Reduce_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z40ncclKernel_Reduce_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z35ncclKernel_Reduce_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z35ncclKernel_Reduce_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z44ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z45ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z35ncclKernel_Reduce_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z35ncclKernel_Reduce_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z35ncclKernel_Reduce_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z35ncclKernel_Reduce_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_TREE_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_NVLS_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_SIMPLE_Sum_floatv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL128_Sum_floatv 456 bytes stack frame, 328 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_RING_LL_Sum_floatv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_TREE_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z40ncclKernel_Reduce_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z40ncclKernel_Reduce_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z35ncclKernel_Reduce_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z35ncclKernel_Reduce_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z44ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z45ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z35ncclKernel_Reduce_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z35ncclKernel_Reduce_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z35ncclKernel_Reduce_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z35ncclKernel_Reduce_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_TREE_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_NVLS_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_SIMPLE_Sum_floatv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL128_Sum_floatv 448 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_RING_LL_Sum_floatv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_TREE_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z40ncclKernel_Reduce_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z40ncclKernel_Reduce_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z35ncclKernel_Reduce_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z35ncclKernel_Reduce_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z44ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z45ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z35ncclKernel_Reduce_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z35ncclKernel_Reduce_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z35ncclKernel_Reduce_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z35ncclKernel_Reduce_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_TREE_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_NVLS_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_SIMPLE_Sum_floatv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL128_Sum_floatv 448 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_RING_LL_Sum_floatv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_TREE_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z40ncclKernel_Reduce_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z40ncclKernel_Reduce_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z35ncclKernel_Reduce_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z35ncclKernel_Reduce_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z44ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z45ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z35ncclKernel_Reduce_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z35ncclKernel_Reduce_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 82 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z35ncclKernel_Reduce_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z35ncclKernel_Reduce_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_TREE_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_NVLS_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_SIMPLE_Sum_floatv 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL128_Sum_floatv 512 bytes stack frame, 396 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_RING_LL_Sum_floatv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_TREE_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z40ncclKernel_Reduce_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z40ncclKernel_Reduce_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z35ncclKernel_Reduce_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z35ncclKernel_Reduce_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z44ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z45ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z35ncclKernel_Reduce_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z35ncclKernel_Reduce_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 84 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z35ncclKernel_Reduce_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z35ncclKernel_Reduce_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_TREE_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_NVLS_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_SIMPLE_Sum_floatv 304 bytes stack frame, 320 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL128_Sum_floatv 504 bytes stack frame, 396 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_RING_LL_Sum_floatv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_TREE_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z40ncclKernel_Reduce_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z40ncclKernel_Reduce_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z35ncclKernel_Reduce_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z35ncclKernel_Reduce_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z44ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z44ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z45ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z45ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z35ncclKernel_Reduce_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z35ncclKernel_Reduce_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers ptxas info : Compiling entry function '_Z35ncclKernel_Reduce_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z35ncclKernel_Reduce_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_TREE_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_NVLS_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_SIMPLE_Sum_floatv 256 bytes stack frame, 272 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL128_Sum_floatv 496 bytes stack frame, 356 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_RING_LL_Sum_floatv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_TREE_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z37ncclKernel_Reduce_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z37ncclKernel_Reduce_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 344 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z37ncclKernel_Reduce_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Sum_uint8_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Sum_uint8_tv 448 bytes stack frame, 312 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Sum_uint8_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z37ncclKernel_Reduce_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z37ncclKernel_Reduce_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 344 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z37ncclKernel_Reduce_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Sum_uint8_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Sum_uint8_tv 448 bytes stack frame, 312 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Sum_uint8_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z37ncclKernel_Reduce_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z37ncclKernel_Reduce_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 344 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z37ncclKernel_Reduce_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Sum_uint8_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Sum_uint8_tv 448 bytes stack frame, 312 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Sum_uint8_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z37ncclKernel_Reduce_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z37ncclKernel_Reduce_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 79 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z37ncclKernel_Reduce_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Sum_uint8_tv 208 bytes stack frame, 208 bytes spill stores, 208 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Sum_uint8_tv 504 bytes stack frame, 388 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Sum_uint8_tv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z37ncclKernel_Reduce_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z37ncclKernel_Reduce_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 81 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z37ncclKernel_Reduce_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Sum_uint8_tv 368 bytes stack frame, 460 bytes spill stores, 820 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Sum_uint8_tv 496 bytes stack frame, 360 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Sum_uint8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z42ncclKernel_Reduce_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z37ncclKernel_Reduce_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z46ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z47ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z37ncclKernel_Reduce_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers ptxas info : Compiling entry function '_Z37ncclKernel_Reduce_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z37ncclKernel_Reduce_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Sum_uint8_tv 352 bytes stack frame, 452 bytes spill stores, 744 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Sum_uint8_tv 480 bytes stack frame, 324 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Sum_uint8_tv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z41ncclKernel_Reduce_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z41ncclKernel_Reduce_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z36ncclKernel_Reduce_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z45ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z46ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z36ncclKernel_Reduce_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 344 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z36ncclKernel_Reduce_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Sum_int8_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Sum_int8_tv 448 bytes stack frame, 312 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Sum_int8_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z41ncclKernel_Reduce_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z41ncclKernel_Reduce_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z36ncclKernel_Reduce_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z45ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z46ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z36ncclKernel_Reduce_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 344 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z36ncclKernel_Reduce_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Sum_int8_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Sum_int8_tv 448 bytes stack frame, 312 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Sum_int8_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z41ncclKernel_Reduce_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z41ncclKernel_Reduce_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z36ncclKernel_Reduce_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z45ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z46ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z36ncclKernel_Reduce_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 344 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z36ncclKernel_Reduce_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Sum_int8_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Sum_int8_tv 448 bytes stack frame, 312 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Sum_int8_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z41ncclKernel_Reduce_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z41ncclKernel_Reduce_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z36ncclKernel_Reduce_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z45ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z46ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z36ncclKernel_Reduce_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 79 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z36ncclKernel_Reduce_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Sum_int8_tv 208 bytes stack frame, 208 bytes spill stores, 208 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Sum_int8_tv 504 bytes stack frame, 388 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Sum_int8_tv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z41ncclKernel_Reduce_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z41ncclKernel_Reduce_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z36ncclKernel_Reduce_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z45ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z46ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z36ncclKernel_Reduce_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 81 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z36ncclKernel_Reduce_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Sum_int8_tv 368 bytes stack frame, 460 bytes spill stores, 820 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Sum_int8_tv 496 bytes stack frame, 360 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Sum_int8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z41ncclKernel_Reduce_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z41ncclKernel_Reduce_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z36ncclKernel_Reduce_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z45ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z45ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z46ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z46ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z36ncclKernel_Reduce_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers ptxas info : Compiling entry function '_Z36ncclKernel_Reduce_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z36ncclKernel_Reduce_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Sum_int8_tv 352 bytes stack frame, 452 bytes spill stores, 744 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Sum_int8_tv 480 bytes stack frame, 324 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Sum_int8_tv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Prod_int32_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Prod_int32_tv 464 bytes stack frame, 344 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Prod_int32_tv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Prod_int32_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Prod_int32_tv 456 bytes stack frame, 332 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Prod_int32_tv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Prod_int32_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Prod_int32_tv 456 bytes stack frame, 332 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Prod_int32_tv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Prod_int32_tv 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Prod_int32_tv 512 bytes stack frame, 396 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Prod_int32_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Prod_int32_tv 304 bytes stack frame, 320 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Prod_int32_tv 504 bytes stack frame, 396 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Prod_int32_tv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Prod_int32_tv 256 bytes stack frame, 272 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Prod_int32_tv 496 bytes stack frame, 356 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Prod_int32_tv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z39ncclKernel_Reduce_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclKernel_Reduce_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z34ncclKernel_Reduce_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z34ncclKernel_Reduce_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z43ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z44ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z34ncclKernel_Reduce_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z34ncclKernel_Reduce_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z34ncclKernel_Reduce_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z34ncclKernel_Reduce_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_TREE_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_NVLS_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_SIMPLE_Sum_halfv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL128_Sum_halfv 464 bytes stack frame, 340 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_RING_LL_Sum_halfv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_TREE_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z39ncclKernel_Reduce_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclKernel_Reduce_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z34ncclKernel_Reduce_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z34ncclKernel_Reduce_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z43ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z44ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z34ncclKernel_Reduce_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z34ncclKernel_Reduce_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z34ncclKernel_Reduce_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z34ncclKernel_Reduce_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_TREE_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_NVLS_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_SIMPLE_Sum_halfv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL128_Sum_halfv 488 bytes stack frame, 360 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_RING_LL_Sum_halfv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_TREE_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z39ncclKernel_Reduce_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclKernel_Reduce_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z34ncclKernel_Reduce_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z34ncclKernel_Reduce_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z43ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z44ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z34ncclKernel_Reduce_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z34ncclKernel_Reduce_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z34ncclKernel_Reduce_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z34ncclKernel_Reduce_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_TREE_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_NVLS_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_SIMPLE_Sum_halfv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL128_Sum_halfv 464 bytes stack frame, 340 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_RING_LL_Sum_halfv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_TREE_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z39ncclKernel_Reduce_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclKernel_Reduce_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z34ncclKernel_Reduce_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z34ncclKernel_Reduce_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z43ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z44ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z34ncclKernel_Reduce_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z34ncclKernel_Reduce_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 78 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z34ncclKernel_Reduce_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z34ncclKernel_Reduce_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_TREE_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_NVLS_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_SIMPLE_Sum_halfv 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL128_Sum_halfv 512 bytes stack frame, 404 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_RING_LL_Sum_halfv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_TREE_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z39ncclKernel_Reduce_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclKernel_Reduce_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z34ncclKernel_Reduce_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z34ncclKernel_Reduce_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z43ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z44ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z34ncclKernel_Reduce_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z34ncclKernel_Reduce_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 79 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z34ncclKernel_Reduce_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z34ncclKernel_Reduce_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_TREE_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_NVLS_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_SIMPLE_Sum_halfv 256 bytes stack frame, 268 bytes spill stores, 276 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL128_Sum_halfv 512 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_RING_LL_Sum_halfv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_TREE_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z39ncclKernel_Reduce_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclKernel_Reduce_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z34ncclKernel_Reduce_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z34ncclKernel_Reduce_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z43ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z44ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z44ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z34ncclKernel_Reduce_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z34ncclKernel_Reduce_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers ptxas info : Compiling entry function '_Z34ncclKernel_Reduce_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z34ncclKernel_Reduce_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_TREE_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_NVLS_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_SIMPLE_Sum_halfv 232 bytes stack frame, 252 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL128_Sum_halfv 504 bytes stack frame, 372 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_RING_LL_Sum_halfv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_TREE_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_SIMPLE_Prod_uint64_tv 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_LL128_Prod_uint64_tv 464 bytes stack frame, 340 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL_Prod_uint64_tv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_SIMPLE_Prod_uint64_tv 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_LL128_Prod_uint64_tv 456 bytes stack frame, 328 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL_Prod_uint64_tv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_SIMPLE_Prod_uint64_tv 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_LL128_Prod_uint64_tv 456 bytes stack frame, 328 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL_Prod_uint64_tv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_SIMPLE_Prod_uint64_tv 208 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_LL128_Prod_uint64_tv 512 bytes stack frame, 408 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL_Prod_uint64_tv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_SIMPLE_Prod_uint64_tv 328 bytes stack frame, 360 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_LL128_Prod_uint64_tv 496 bytes stack frame, 372 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL_Prod_uint64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_SIMPLE_Prod_uint64_tv 256 bytes stack frame, 272 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_LL128_Prod_uint64_tv 472 bytes stack frame, 312 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL_Prod_uint64_tv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_SIMPLE_Prod_uint32_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_LL128_Prod_uint32_tv 464 bytes stack frame, 344 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL_Prod_uint32_tv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_SIMPLE_Prod_uint32_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_LL128_Prod_uint32_tv 456 bytes stack frame, 332 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL_Prod_uint32_tv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_SIMPLE_Prod_uint32_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_LL128_Prod_uint32_tv 456 bytes stack frame, 332 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL_Prod_uint32_tv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_SIMPLE_Prod_uint32_tv 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_LL128_Prod_uint32_tv 512 bytes stack frame, 396 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL_Prod_uint32_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_SIMPLE_Prod_uint32_tv 304 bytes stack frame, 320 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_LL128_Prod_uint32_tv 504 bytes stack frame, 396 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL_Prod_uint32_tv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_SIMPLE_Prod_uint32_tv 256 bytes stack frame, 272 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_LL128_Prod_uint32_tv 496 bytes stack frame, 356 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL_Prod_uint32_tv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Prod_doublev 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Prod_doublev 456 bytes stack frame, 332 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Prod_doublev 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Prod_doublev 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Prod_doublev 456 bytes stack frame, 320 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Prod_doublev 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Prod_doublev 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Prod_doublev 456 bytes stack frame, 320 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Prod_doublev 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Prod_doublev 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Prod_doublev 512 bytes stack frame, 400 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Prod_doublev 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Prod_doublev 264 bytes stack frame, 280 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Prod_doublev 496 bytes stack frame, 360 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Prod_doublev 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Prod_doublev 216 bytes stack frame, 212 bytes spill stores, 212 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Prod_doublev 472 bytes stack frame, 312 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Prod_doublev 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Prod_int64_tv 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Prod_int64_tv 464 bytes stack frame, 340 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Prod_int64_tv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Prod_int64_tv 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Prod_int64_tv 456 bytes stack frame, 328 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Prod_int64_tv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Prod_int64_tv 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Prod_int64_tv 456 bytes stack frame, 328 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Prod_int64_tv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Prod_int64_tv 208 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Prod_int64_tv 512 bytes stack frame, 408 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Prod_int64_tv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Prod_int64_tv 328 bytes stack frame, 360 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Prod_int64_tv 496 bytes stack frame, 372 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Prod_int64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Prod_int64_tv 256 bytes stack frame, 272 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Prod_int64_tv 472 bytes stack frame, 312 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Prod_int64_tv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Min_int32_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Min_int32_tv 456 bytes stack frame, 328 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Min_int32_tv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Min_int32_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Min_int32_tv 448 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Min_int32_tv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Min_int32_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Min_int32_tv 448 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Min_int32_tv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Min_int32_tv 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Min_int32_tv 512 bytes stack frame, 396 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Min_int32_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Min_int32_tv 304 bytes stack frame, 320 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Min_int32_tv 504 bytes stack frame, 396 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Min_int32_tv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Min_int32_tv 256 bytes stack frame, 272 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Min_int32_tv 496 bytes stack frame, 356 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Min_int32_tv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Prod_floatv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Prod_floatv 456 bytes stack frame, 328 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Prod_floatv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Prod_floatv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Prod_floatv 448 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Prod_floatv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Prod_floatv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Prod_floatv 448 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Prod_floatv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Prod_floatv 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Prod_floatv 512 bytes stack frame, 396 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Prod_floatv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Prod_floatv 304 bytes stack frame, 320 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Prod_floatv 504 bytes stack frame, 396 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Prod_floatv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Prod_floatv 256 bytes stack frame, 272 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Prod_floatv 496 bytes stack frame, 356 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Prod_floatv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Prod_int8_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Prod_int8_tv 456 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Prod_int8_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Prod_int8_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Prod_int8_tv 456 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Prod_int8_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Prod_int8_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Prod_int8_tv 456 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Prod_int8_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Prod_int8_tv 208 bytes stack frame, 208 bytes spill stores, 208 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Prod_int8_tv 520 bytes stack frame, 392 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Prod_int8_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Prod_int8_tv 368 bytes stack frame, 452 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Prod_int8_tv 512 bytes stack frame, 380 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Prod_int8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Prod_int8_tv 336 bytes stack frame, 436 bytes spill stores, 772 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Prod_int8_tv 504 bytes stack frame, 384 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Prod_int8_tv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Prod_uint8_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Prod_uint8_tv 456 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Prod_uint8_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Prod_uint8_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Prod_uint8_tv 456 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Prod_uint8_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Prod_uint8_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Prod_uint8_tv 456 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Prod_uint8_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Prod_uint8_tv 208 bytes stack frame, 208 bytes spill stores, 208 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Prod_uint8_tv 520 bytes stack frame, 392 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Prod_uint8_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Prod_uint8_tv 368 bytes stack frame, 452 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Prod_uint8_tv 512 bytes stack frame, 380 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Prod_uint8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Prod_uint8_tv 336 bytes stack frame, 436 bytes spill stores, 772 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Prod_uint8_tv 504 bytes stack frame, 384 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Prod_uint8_tv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Min_uint32_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Min_uint32_tv 456 bytes stack frame, 328 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Min_uint32_tv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Min_uint32_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Min_uint32_tv 448 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Min_uint32_tv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Min_uint32_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Min_uint32_tv 448 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Min_uint32_tv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Min_uint32_tv 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Min_uint32_tv 512 bytes stack frame, 396 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Min_uint32_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Min_uint32_tv 304 bytes stack frame, 320 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Min_uint32_tv 504 bytes stack frame, 396 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Min_uint32_tv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Min_uint32_tv 256 bytes stack frame, 272 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Min_uint32_tv 496 bytes stack frame, 356 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Min_uint32_tv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_TREE_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_NVLS_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_SIMPLE_Prod_halfv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL128_Prod_halfv 464 bytes stack frame, 340 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_RING_LL_Prod_halfv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_TREE_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_TREE_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_NVLS_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_SIMPLE_Prod_halfv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL128_Prod_halfv 488 bytes stack frame, 360 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_RING_LL_Prod_halfv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_TREE_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_TREE_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_NVLS_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_SIMPLE_Prod_halfv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL128_Prod_halfv 464 bytes stack frame, 340 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_RING_LL_Prod_halfv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_TREE_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_TREE_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_NVLS_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_SIMPLE_Prod_halfv 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL128_Prod_halfv 512 bytes stack frame, 404 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_RING_LL_Prod_halfv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_TREE_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_TREE_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_NVLS_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_SIMPLE_Prod_halfv 256 bytes stack frame, 268 bytes spill stores, 276 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL128_Prod_halfv 512 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_RING_LL_Prod_halfv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_TREE_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_TREE_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_NVLS_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_SIMPLE_Prod_halfv 232 bytes stack frame, 252 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL128_Prod_halfv 504 bytes stack frame, 372 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_RING_LL_Prod_halfv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_TREE_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Min_int8_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Min_int8_tv 456 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Min_int8_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Min_int8_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Min_int8_tv 456 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Min_int8_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Min_int8_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Min_int8_tv 456 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Min_int8_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Min_int8_tv 288 bytes stack frame, 320 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Min_int8_tv 520 bytes stack frame, 392 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Min_int8_tv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Min_int8_tv 368 bytes stack frame, 452 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Min_int8_tv 512 bytes stack frame, 380 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Min_int8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Min_int8_tv 336 bytes stack frame, 432 bytes spill stores, 768 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Min_int8_tv 512 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Min_int8_tv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z48ncclKernel_Reduce_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z48ncclKernel_Reduce_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z43ncclKernel_Reduce_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z52ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z52ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z53ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z53ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z43ncclKernel_Reduce_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z43ncclKernel_Reduce_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_Sum___nv_bfloat16v 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_Sum___nv_bfloat16v 480 bytes stack frame, 352 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_Sum___nv_bfloat16v 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z48ncclKernel_Reduce_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z48ncclKernel_Reduce_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z43ncclKernel_Reduce_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z52ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z52ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z53ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z53ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z43ncclKernel_Reduce_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z43ncclKernel_Reduce_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_Sum___nv_bfloat16v 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_Sum___nv_bfloat16v 480 bytes stack frame, 352 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_Sum___nv_bfloat16v 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z48ncclKernel_Reduce_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z48ncclKernel_Reduce_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z43ncclKernel_Reduce_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z52ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z52ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z53ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z53ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z43ncclKernel_Reduce_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 75 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z43ncclKernel_Reduce_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_Sum___nv_bfloat16v 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_Sum___nv_bfloat16v 480 bytes stack frame, 352 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_Sum___nv_bfloat16v 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z48ncclKernel_Reduce_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z48ncclKernel_Reduce_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z43ncclKernel_Reduce_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z52ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z52ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z53ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z53ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z43ncclKernel_Reduce_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 78 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z43ncclKernel_Reduce_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_Sum___nv_bfloat16v 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_Sum___nv_bfloat16v 504 bytes stack frame, 396 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_Sum___nv_bfloat16v 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z48ncclKernel_Reduce_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z48ncclKernel_Reduce_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z43ncclKernel_Reduce_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z52ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z52ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z53ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z53ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z43ncclKernel_Reduce_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 79 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z43ncclKernel_Reduce_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_Sum___nv_bfloat16v 256 bytes stack frame, 268 bytes spill stores, 276 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_Sum___nv_bfloat16v 512 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_Sum___nv_bfloat16v 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z48ncclKernel_Reduce_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z48ncclKernel_Reduce_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z43ncclKernel_Reduce_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z52ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z52ncclKernel_Reduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z53ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z53ncclKernel_Reduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z43ncclKernel_Reduce_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers ptxas info : Compiling entry function '_Z43ncclKernel_Reduce_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z43ncclKernel_Reduce_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_Sum___nv_bfloat16v 232 bytes stack frame, 252 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_Sum___nv_bfloat16v 504 bytes stack frame, 372 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_Sum___nv_bfloat16v 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Min_uint8_tv 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Min_uint8_tv 456 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Min_uint8_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Min_uint8_tv 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Min_uint8_tv 456 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Min_uint8_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Min_uint8_tv 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Min_uint8_tv 456 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Min_uint8_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Min_uint8_tv 288 bytes stack frame, 320 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Min_uint8_tv 520 bytes stack frame, 392 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Min_uint8_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Min_uint8_tv 368 bytes stack frame, 452 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Min_uint8_tv 512 bytes stack frame, 380 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Min_uint8_tv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Min_uint8_tv 336 bytes stack frame, 436 bytes spill stores, 772 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Min_uint8_tv 512 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Min_uint8_tv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_Prod___nv_bfloat16v 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_Prod___nv_bfloat16v 480 bytes stack frame, 352 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_Prod___nv_bfloat16v 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_Prod___nv_bfloat16v 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_Prod___nv_bfloat16v 480 bytes stack frame, 352 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_Prod___nv_bfloat16v 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_Prod___nv_bfloat16v 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_Prod___nv_bfloat16v 480 bytes stack frame, 352 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_Prod___nv_bfloat16v 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_Prod___nv_bfloat16v 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_Prod___nv_bfloat16v 504 bytes stack frame, 396 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_Prod___nv_bfloat16v 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_Prod___nv_bfloat16v 256 bytes stack frame, 268 bytes spill stores, 276 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_Prod___nv_bfloat16v 512 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_Prod___nv_bfloat16v 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_Prod___nv_bfloat16v 232 bytes stack frame, 252 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_Prod___nv_bfloat16v 504 bytes stack frame, 372 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_Prod___nv_bfloat16v 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Min_int64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Min_int64_tv 448 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Min_int64_tv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Min_int64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Min_int64_tv 448 bytes stack frame, 300 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Min_int64_tv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Min_int64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Min_int64_tv 448 bytes stack frame, 300 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Min_int64_tv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Min_int64_tv 208 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Min_int64_tv 512 bytes stack frame, 408 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Min_int64_tv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Min_int64_tv 312 bytes stack frame, 348 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Min_int64_tv 504 bytes stack frame, 396 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Min_int64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Min_int64_tv 256 bytes stack frame, 268 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Min_int64_tv 488 bytes stack frame, 336 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Min_int64_tv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Min_uint64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Min_uint64_tv 448 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Min_uint64_tv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Min_uint64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Min_uint64_tv 448 bytes stack frame, 300 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Min_uint64_tv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Min_uint64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Min_uint64_tv 448 bytes stack frame, 300 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Min_uint64_tv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Min_uint64_tv 208 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Min_uint64_tv 512 bytes stack frame, 408 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Min_uint64_tv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Min_uint64_tv 312 bytes stack frame, 348 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Min_uint64_tv 504 bytes stack frame, 396 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Min_uint64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Min_uint64_tv 256 bytes stack frame, 268 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Min_uint64_tv 488 bytes stack frame, 336 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Min_uint64_tv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Min_doublev 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Min_doublev 456 bytes stack frame, 332 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Min_doublev 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Min_doublev 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Min_doublev 456 bytes stack frame, 320 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Min_doublev 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Min_doublev 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Min_doublev 456 bytes stack frame, 320 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Min_doublev 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Min_doublev 208 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Min_doublev 512 bytes stack frame, 400 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Min_doublev 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Min_doublev 328 bytes stack frame, 360 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Min_doublev 496 bytes stack frame, 360 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Min_doublev 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Min_doublev 264 bytes stack frame, 296 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Min_doublev 472 bytes stack frame, 304 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Min_doublev 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_TREE_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_NVLS_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_SIMPLE_Min_floatv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL128_Min_floatv 456 bytes stack frame, 328 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_RING_LL_Min_floatv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_TREE_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_TREE_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_NVLS_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_SIMPLE_Min_floatv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL128_Min_floatv 448 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_RING_LL_Min_floatv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_TREE_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_TREE_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_NVLS_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_SIMPLE_Min_floatv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL128_Min_floatv 448 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_RING_LL_Min_floatv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_TREE_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_TREE_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_NVLS_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_SIMPLE_Min_floatv 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL128_Min_floatv 512 bytes stack frame, 396 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_RING_LL_Min_floatv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_TREE_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_TREE_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_NVLS_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_SIMPLE_Min_floatv 304 bytes stack frame, 320 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL128_Min_floatv 504 bytes stack frame, 396 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_RING_LL_Min_floatv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_TREE_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_TREE_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_NVLS_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_SIMPLE_Min_floatv 256 bytes stack frame, 272 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL128_Min_floatv 496 bytes stack frame, 356 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_RING_LL_Min_floatv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_TREE_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Max_uint32_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Max_uint32_tv 456 bytes stack frame, 328 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Max_uint32_tv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Max_uint32_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Max_uint32_tv 448 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Max_uint32_tv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Max_uint32_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Max_uint32_tv 448 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Max_uint32_tv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Max_uint32_tv 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Max_uint32_tv 512 bytes stack frame, 396 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Max_uint32_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Max_uint32_tv 304 bytes stack frame, 320 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Max_uint32_tv 504 bytes stack frame, 396 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Max_uint32_tv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Max_uint32_tv 256 bytes stack frame, 272 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Max_uint32_tv 496 bytes stack frame, 356 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Max_uint32_tv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_TREE_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_NVLS_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_SIMPLE_Min_halfv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL128_Min_halfv 488 bytes stack frame, 360 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_RING_LL_Min_halfv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_TREE_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_TREE_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_NVLS_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_SIMPLE_Min_halfv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL128_Min_halfv 464 bytes stack frame, 340 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_RING_LL_Min_halfv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_TREE_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_TREE_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_NVLS_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_SIMPLE_Min_halfv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL128_Min_halfv 488 bytes stack frame, 360 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_RING_LL_Min_halfv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_TREE_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_TREE_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_NVLS_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_SIMPLE_Min_halfv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL128_Min_halfv 512 bytes stack frame, 400 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_RING_LL_Min_halfv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_TREE_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_TREE_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_NVLS_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_SIMPLE_Min_halfv 256 bytes stack frame, 268 bytes spill stores, 276 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL128_Min_halfv 512 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_RING_LL_Min_halfv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_TREE_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_TREE_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_NVLS_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_COLLNET_CHAIN_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_DIRECT_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_SIMPLE_Min_halfv 232 bytes stack frame, 252 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL128_Min_halfv 504 bytes stack frame, 372 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_RING_LL_Min_halfv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_TREE_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Max_int32_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Max_int32_tv 456 bytes stack frame, 328 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Max_int32_tv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Max_int32_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Max_int32_tv 448 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Max_int32_tv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Max_int32_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Max_int32_tv 448 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Max_int32_tv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Max_int32_tv 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Max_int32_tv 512 bytes stack frame, 396 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Max_int32_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Max_int32_tv 304 bytes stack frame, 320 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Max_int32_tv 504 bytes stack frame, 396 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Max_int32_tv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Max_int32_tv 256 bytes stack frame, 272 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Max_int32_tv 496 bytes stack frame, 356 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Max_int32_tv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Max_int64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Max_int64_tv 448 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Max_int64_tv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Max_int64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Max_int64_tv 448 bytes stack frame, 300 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Max_int64_tv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Max_int64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Max_int64_tv 448 bytes stack frame, 300 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Max_int64_tv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Max_int64_tv 208 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Max_int64_tv 512 bytes stack frame, 408 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Max_int64_tv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Max_int64_tv 312 bytes stack frame, 348 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Max_int64_tv 504 bytes stack frame, 396 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Max_int64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Max_int64_tv 256 bytes stack frame, 268 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Max_int64_tv 488 bytes stack frame, 336 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Max_int64_tv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Max_uint64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Max_uint64_tv 448 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Max_uint64_tv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Max_uint64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Max_uint64_tv 448 bytes stack frame, 300 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Max_uint64_tv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Max_uint64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Max_uint64_tv 448 bytes stack frame, 300 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Max_uint64_tv 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Max_uint64_tv 208 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Max_uint64_tv 512 bytes stack frame, 408 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Max_uint64_tv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Max_uint64_tv 312 bytes stack frame, 348 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Max_uint64_tv 504 bytes stack frame, 396 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Max_uint64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_SIMPLE_Max_uint64_tv 256 bytes stack frame, 268 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL128_Max_uint64_tv 488 bytes stack frame, 336 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL_Max_uint64_tv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Max_doublev 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Max_doublev 456 bytes stack frame, 332 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Max_doublev 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Max_doublev 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Max_doublev 456 bytes stack frame, 320 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Max_doublev 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Max_doublev 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Max_doublev 456 bytes stack frame, 320 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Max_doublev 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Max_doublev 208 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Max_doublev 512 bytes stack frame, 400 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Max_doublev 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Max_doublev 328 bytes stack frame, 360 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Max_doublev 496 bytes stack frame, 360 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Max_doublev 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Max_doublev 264 bytes stack frame, 296 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Max_doublev 472 bytes stack frame, 304 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Max_doublev 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_TREE_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_NVLS_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_SIMPLE_Max_floatv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL128_Max_floatv 456 bytes stack frame, 328 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_RING_LL_Max_floatv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_TREE_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_TREE_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_NVLS_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_SIMPLE_Max_floatv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL128_Max_floatv 448 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_RING_LL_Max_floatv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_TREE_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_TREE_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_NVLS_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_SIMPLE_Max_floatv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL128_Max_floatv 448 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_RING_LL_Max_floatv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_TREE_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_TREE_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_NVLS_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_SIMPLE_Max_floatv 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL128_Max_floatv 512 bytes stack frame, 396 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_RING_LL_Max_floatv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_TREE_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_TREE_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_NVLS_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_SIMPLE_Max_floatv 304 bytes stack frame, 320 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL128_Max_floatv 504 bytes stack frame, 396 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_RING_LL_Max_floatv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_TREE_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_TREE_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_NVLS_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_SIMPLE_Max_floatv 256 bytes stack frame, 272 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_LL128_Max_floatv 496 bytes stack frame, 356 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_RING_LL_Max_floatv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z37ncclFunction_Reduce_TREE_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_TREE_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_NVLS_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_SIMPLE_Max_halfv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL128_Max_halfv 488 bytes stack frame, 360 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_RING_LL_Max_halfv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_TREE_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_TREE_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_NVLS_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_SIMPLE_Max_halfv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL128_Max_halfv 464 bytes stack frame, 340 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_RING_LL_Max_halfv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_TREE_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_TREE_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_NVLS_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_SIMPLE_Max_halfv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL128_Max_halfv 488 bytes stack frame, 360 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_RING_LL_Max_halfv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_TREE_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_TREE_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_NVLS_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_SIMPLE_Max_halfv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL128_Max_halfv 512 bytes stack frame, 400 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_RING_LL_Max_halfv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_TREE_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_TREE_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_NVLS_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_SIMPLE_Max_halfv 256 bytes stack frame, 268 bytes spill stores, 276 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL128_Max_halfv 512 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_RING_LL_Max_halfv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_TREE_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_TREE_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_NVLS_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_NVLS_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_RING_SIMPLE_Max_halfv 232 bytes stack frame, 252 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL128_Max_halfv 504 bytes stack frame, 372 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_RING_LL_Max_halfv 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_Reduce_TREE_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z36ncclFunction_Reduce_TREE_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Max_int8_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Max_int8_tv 456 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Max_int8_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Max_int8_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Max_int8_tv 456 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Max_int8_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Max_int8_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Max_int8_tv 456 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Max_int8_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Max_int8_tv 288 bytes stack frame, 320 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Max_int8_tv 520 bytes stack frame, 392 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Max_int8_tv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Max_int8_tv 368 bytes stack frame, 452 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Max_int8_tv 512 bytes stack frame, 380 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Max_int8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_TREE_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_TREE_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_NVLS_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_NVLS_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_SIMPLE_Max_int8_tv 336 bytes stack frame, 432 bytes spill stores, 768 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_RING_LL128_Max_int8_tv 512 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_RING_LL_Max_int8_tv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_Reduce_TREE_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z38ncclFunction_Reduce_TREE_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_PreMulSum_int32_tv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_PreMulSum_int32_tv 472 bytes stack frame, 344 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_PreMulSum_int32_tv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_PreMulSum_int32_tv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_PreMulSum_int32_tv 456 bytes stack frame, 344 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_PreMulSum_int32_tv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_PreMulSum_int32_tv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_PreMulSum_int32_tv 456 bytes stack frame, 344 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_PreMulSum_int32_tv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_PreMulSum_int32_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_PreMulSum_int32_tv 512 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_PreMulSum_int32_tv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_PreMulSum_int32_tv 312 bytes stack frame, 324 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_PreMulSum_int32_tv 520 bytes stack frame, 416 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_PreMulSum_int32_tv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_PreMulSum_int32_tv 264 bytes stack frame, 276 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_PreMulSum_int32_tv 488 bytes stack frame, 344 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_PreMulSum_int32_tv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_PreMulSum_uint32_tv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_PreMulSum_uint32_tv 472 bytes stack frame, 344 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_PreMulSum_uint32_tv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_PreMulSum_uint32_tv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_PreMulSum_uint32_tv 456 bytes stack frame, 344 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_PreMulSum_uint32_tv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_PreMulSum_uint32_tv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_PreMulSum_uint32_tv 456 bytes stack frame, 344 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_PreMulSum_uint32_tv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_PreMulSum_uint32_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_PreMulSum_uint32_tv 512 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_PreMulSum_uint32_tv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_PreMulSum_uint32_tv 312 bytes stack frame, 324 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_PreMulSum_uint32_tv 520 bytes stack frame, 416 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_PreMulSum_uint32_tv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_PreMulSum_uint32_tv 264 bytes stack frame, 276 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_PreMulSum_uint32_tv 488 bytes stack frame, 344 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_PreMulSum_uint32_tv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Max_uint8_tv 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Max_uint8_tv 456 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Max_uint8_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Max_uint8_tv 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Max_uint8_tv 456 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Max_uint8_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Max_uint8_tv 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Max_uint8_tv 456 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Max_uint8_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Max_uint8_tv 288 bytes stack frame, 320 bytes spill stores, 388 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Max_uint8_tv 520 bytes stack frame, 392 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Max_uint8_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Max_uint8_tv 368 bytes stack frame, 452 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Max_uint8_tv 512 bytes stack frame, 380 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Max_uint8_tv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_TREE_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_NVLS_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_COLLNET_CHAIN_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_COLLNET_DIRECT_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_SIMPLE_Max_uint8_tv 336 bytes stack frame, 436 bytes spill stores, 772 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL128_Max_uint8_tv 512 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_RING_LL_Max_uint8_tv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_Reduce_TREE_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_Min___nv_bfloat16v 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_Min___nv_bfloat16v 480 bytes stack frame, 352 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_Min___nv_bfloat16v 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_Min___nv_bfloat16v 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_Min___nv_bfloat16v 480 bytes stack frame, 352 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_Min___nv_bfloat16v 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_Min___nv_bfloat16v 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_Min___nv_bfloat16v 480 bytes stack frame, 352 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_Min___nv_bfloat16v 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_Min___nv_bfloat16v 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_Min___nv_bfloat16v 504 bytes stack frame, 396 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_Min___nv_bfloat16v 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_Min___nv_bfloat16v 256 bytes stack frame, 268 bytes spill stores, 276 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_Min___nv_bfloat16v 512 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_Min___nv_bfloat16v 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_Min___nv_bfloat16v 232 bytes stack frame, 252 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_Min___nv_bfloat16v 504 bytes stack frame, 372 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_Min___nv_bfloat16v 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_PreMulSum_uint64_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_PreMulSum_uint64_tv 480 bytes stack frame, 356 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_PreMulSum_uint64_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_PreMulSum_uint64_tv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_PreMulSum_uint64_tv 472 bytes stack frame, 340 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_PreMulSum_uint64_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_PreMulSum_uint64_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_PreMulSum_uint64_tv 472 bytes stack frame, 340 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_PreMulSum_uint64_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_PreMulSum_uint64_tv 208 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_PreMulSum_uint64_tv 504 bytes stack frame, 396 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_PreMulSum_uint64_tv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_PreMulSum_uint64_tv 328 bytes stack frame, 348 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_PreMulSum_uint64_tv 504 bytes stack frame, 384 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_PreMulSum_uint64_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_PreMulSum_uint64_tv 280 bytes stack frame, 312 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_PreMulSum_uint64_tv 504 bytes stack frame, 376 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_PreMulSum_uint64_tv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_PreMulSum_int64_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_PreMulSum_int64_tv 480 bytes stack frame, 356 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_PreMulSum_int64_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_PreMulSum_int64_tv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_PreMulSum_int64_tv 472 bytes stack frame, 340 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_PreMulSum_int64_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_PreMulSum_int64_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_PreMulSum_int64_tv 472 bytes stack frame, 340 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_PreMulSum_int64_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_PreMulSum_int64_tv 208 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_PreMulSum_int64_tv 504 bytes stack frame, 396 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_PreMulSum_int64_tv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_PreMulSum_int64_tv 328 bytes stack frame, 348 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_PreMulSum_int64_tv 504 bytes stack frame, 384 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_PreMulSum_int64_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_PreMulSum_int64_tv 280 bytes stack frame, 312 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_PreMulSum_int64_tv 504 bytes stack frame, 376 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_PreMulSum_int64_tv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_Max___nv_bfloat16v 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_Max___nv_bfloat16v 480 bytes stack frame, 352 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_Max___nv_bfloat16v 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_Max___nv_bfloat16v 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_Max___nv_bfloat16v 480 bytes stack frame, 352 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_Max___nv_bfloat16v 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_Max___nv_bfloat16v 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_Max___nv_bfloat16v 480 bytes stack frame, 352 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_Max___nv_bfloat16v 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_Max___nv_bfloat16v 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_Max___nv_bfloat16v 504 bytes stack frame, 396 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_Max___nv_bfloat16v 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_Max___nv_bfloat16v 256 bytes stack frame, 268 bytes spill stores, 276 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_Max___nv_bfloat16v 512 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_Max___nv_bfloat16v 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_Max___nv_bfloat16v 232 bytes stack frame, 252 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_Max___nv_bfloat16v 504 bytes stack frame, 372 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_Max___nv_bfloat16v 104 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_SIMPLE_PreMulSum_int8_tv 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_RING_LL128_PreMulSum_int8_tv 456 bytes stack frame, 324 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_LL_PreMulSum_int8_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_TREE_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_SIMPLE_PreMulSum_int8_tv 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_RING_LL128_PreMulSum_int8_tv 456 bytes stack frame, 324 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_LL_PreMulSum_int8_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_TREE_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_SIMPLE_PreMulSum_int8_tv 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_RING_LL128_PreMulSum_int8_tv 456 bytes stack frame, 324 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_LL_PreMulSum_int8_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_TREE_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_SIMPLE_PreMulSum_int8_tv 208 bytes stack frame, 208 bytes spill stores, 208 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_RING_LL128_PreMulSum_int8_tv 512 bytes stack frame, 408 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_LL_PreMulSum_int8_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_TREE_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_SIMPLE_PreMulSum_int8_tv 368 bytes stack frame, 448 bytes spill stores, 808 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_RING_LL128_PreMulSum_int8_tv 504 bytes stack frame, 400 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_LL_PreMulSum_int8_tv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_TREE_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_SIMPLE_PreMulSum_int8_tv 360 bytes stack frame, 448 bytes spill stores, 740 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_RING_LL128_PreMulSum_int8_tv 496 bytes stack frame, 368 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_LL_PreMulSum_int8_tv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_TREE_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_PreMulSum_uint8_tv 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_PreMulSum_uint8_tv 456 bytes stack frame, 324 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_PreMulSum_uint8_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_PreMulSum_uint8_tv 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_PreMulSum_uint8_tv 456 bytes stack frame, 324 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_PreMulSum_uint8_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_PreMulSum_uint8_tv 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_PreMulSum_uint8_tv 456 bytes stack frame, 324 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_PreMulSum_uint8_tv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_PreMulSum_uint8_tv 208 bytes stack frame, 208 bytes spill stores, 208 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_PreMulSum_uint8_tv 512 bytes stack frame, 408 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_PreMulSum_uint8_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_PreMulSum_uint8_tv 368 bytes stack frame, 448 bytes spill stores, 808 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_PreMulSum_uint8_tv 504 bytes stack frame, 400 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_PreMulSum_uint8_tv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_PreMulSum_uint8_tv 360 bytes stack frame, 448 bytes spill stores, 740 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_PreMulSum_uint8_tv 496 bytes stack frame, 368 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_PreMulSum_uint8_tv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_RING_SIMPLE_PreMulSum_floatv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL128_PreMulSum_floatv 464 bytes stack frame, 360 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL_PreMulSum_floatv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_RING_SIMPLE_PreMulSum_floatv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL128_PreMulSum_floatv 456 bytes stack frame, 344 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL_PreMulSum_floatv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_RING_SIMPLE_PreMulSum_floatv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL128_PreMulSum_floatv 456 bytes stack frame, 344 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL_PreMulSum_floatv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_RING_SIMPLE_PreMulSum_floatv 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL128_PreMulSum_floatv 512 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL_PreMulSum_floatv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_RING_SIMPLE_PreMulSum_floatv 312 bytes stack frame, 324 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL128_PreMulSum_floatv 520 bytes stack frame, 416 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL_PreMulSum_floatv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_NVLS_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_RING_SIMPLE_PreMulSum_floatv 264 bytes stack frame, 276 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL128_PreMulSum_floatv 488 bytes stack frame, 344 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_RING_LL_PreMulSum_floatv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_Reduce_TREE_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_SIMPLE_PreMulSum_halfv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL128_PreMulSum_halfv 488 bytes stack frame, 364 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL_PreMulSum_halfv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_SIMPLE_PreMulSum_halfv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL128_PreMulSum_halfv 472 bytes stack frame, 348 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL_PreMulSum_halfv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_SIMPLE_PreMulSum_halfv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL128_PreMulSum_halfv 488 bytes stack frame, 364 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL_PreMulSum_halfv 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_SIMPLE_PreMulSum_halfv 208 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL128_PreMulSum_halfv 512 bytes stack frame, 388 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL_PreMulSum_halfv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_SIMPLE_PreMulSum_halfv 264 bytes stack frame, 276 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL128_PreMulSum_halfv 512 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL_PreMulSum_halfv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_NVLS_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_SIMPLE_PreMulSum_halfv 232 bytes stack frame, 252 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL128_PreMulSum_halfv 504 bytes stack frame, 388 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_RING_LL_PreMulSum_halfv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_Reduce_TREE_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_SIMPLE_PreMulSum_doublev 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_RING_LL128_PreMulSum_doublev 472 bytes stack frame, 340 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_LL_PreMulSum_doublev 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_TREE_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_SIMPLE_PreMulSum_doublev 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_RING_LL128_PreMulSum_doublev 472 bytes stack frame, 328 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_LL_PreMulSum_doublev 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_TREE_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_SIMPLE_PreMulSum_doublev 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_RING_LL128_PreMulSum_doublev 472 bytes stack frame, 328 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_LL_PreMulSum_doublev 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_TREE_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_SIMPLE_PreMulSum_doublev 208 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_RING_LL128_PreMulSum_doublev 512 bytes stack frame, 396 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_LL_PreMulSum_doublev 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_TREE_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_SIMPLE_PreMulSum_doublev 264 bytes stack frame, 276 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_RING_LL128_PreMulSum_doublev 496 bytes stack frame, 380 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_LL_PreMulSum_doublev 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_TREE_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_NVLS_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_SIMPLE_PreMulSum_doublev 208 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_RING_LL128_PreMulSum_doublev 488 bytes stack frame, 328 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_RING_LL_PreMulSum_doublev 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_TREE_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_Reduce_TREE_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_int64_tv 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_SumPostDiv_int64_tv 472 bytes stack frame, 344 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_SumPostDiv_int64_tv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_int64_tv 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_SumPostDiv_int64_tv 464 bytes stack frame, 324 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_SumPostDiv_int64_tv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_int64_tv 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_SumPostDiv_int64_tv 464 bytes stack frame, 324 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_SumPostDiv_int64_tv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_int64_tv 232 bytes stack frame, 236 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_SumPostDiv_int64_tv 496 bytes stack frame, 388 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_SumPostDiv_int64_tv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_int64_tv 360 bytes stack frame, 384 bytes spill stores, 760 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_SumPostDiv_int64_tv 504 bytes stack frame, 404 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_SumPostDiv_int64_tv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_int64_tv 312 bytes stack frame, 352 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_SumPostDiv_int64_tv 488 bytes stack frame, 368 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_SumPostDiv_int64_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_int32_tv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_SumPostDiv_int32_tv 456 bytes stack frame, 328 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_SumPostDiv_int32_tv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_int32_tv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_SumPostDiv_int32_tv 448 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_SumPostDiv_int32_tv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_int32_tv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_SumPostDiv_int32_tv 448 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_SumPostDiv_int32_tv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_int32_tv 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_SumPostDiv_int32_tv 512 bytes stack frame, 396 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_SumPostDiv_int32_tv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_int32_tv 312 bytes stack frame, 336 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_SumPostDiv_int32_tv 504 bytes stack frame, 396 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_SumPostDiv_int32_tv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_int32_tv 272 bytes stack frame, 304 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_SumPostDiv_int32_tv 496 bytes stack frame, 356 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_SumPostDiv_int32_tv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_uint64_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_LL128_SumPostDiv_uint64_tv 456 bytes stack frame, 336 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_RING_LL_SumPostDiv_uint64_tv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_TREE_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_uint64_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_LL128_SumPostDiv_uint64_tv 456 bytes stack frame, 328 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_RING_LL_SumPostDiv_uint64_tv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_TREE_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_uint64_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_LL128_SumPostDiv_uint64_tv 456 bytes stack frame, 328 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_RING_LL_SumPostDiv_uint64_tv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_TREE_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_uint64_tv 224 bytes stack frame, 220 bytes spill stores, 220 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_LL128_SumPostDiv_uint64_tv 496 bytes stack frame, 380 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_RING_LL_SumPostDiv_uint64_tv 104 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_TREE_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_uint64_tv 320 bytes stack frame, 336 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_LL128_SumPostDiv_uint64_tv 504 bytes stack frame, 372 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_RING_LL_SumPostDiv_uint64_tv 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_TREE_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_uint64_tv 280 bytes stack frame, 308 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_LL128_SumPostDiv_uint64_tv 496 bytes stack frame, 356 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_RING_LL_SumPostDiv_uint64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_TREE_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_uint32_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_LL128_SumPostDiv_uint32_tv 456 bytes stack frame, 328 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_RING_LL_SumPostDiv_uint32_tv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_TREE_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_uint32_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_LL128_SumPostDiv_uint32_tv 448 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_RING_LL_SumPostDiv_uint32_tv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_TREE_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_uint32_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_LL128_SumPostDiv_uint32_tv 448 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_RING_LL_SumPostDiv_uint32_tv 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_TREE_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_uint32_tv 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_LL128_SumPostDiv_uint32_tv 512 bytes stack frame, 396 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_RING_LL_SumPostDiv_uint32_tv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_TREE_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_uint32_tv 312 bytes stack frame, 336 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_LL128_SumPostDiv_uint32_tv 504 bytes stack frame, 396 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_RING_LL_SumPostDiv_uint32_tv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_TREE_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_NVLS_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_uint32_tv 264 bytes stack frame, 292 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_LL128_SumPostDiv_uint32_tv 496 bytes stack frame, 356 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_RING_LL_SumPostDiv_uint32_tv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_Reduce_TREE_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_uint8_tv 232 bytes stack frame, 248 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_SumPostDiv_uint8_tv 488 bytes stack frame, 360 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_SumPostDiv_uint8_tv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_uint8_tv 232 bytes stack frame, 248 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_SumPostDiv_uint8_tv 488 bytes stack frame, 360 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_SumPostDiv_uint8_tv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_uint8_tv 232 bytes stack frame, 248 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_SumPostDiv_uint8_tv 488 bytes stack frame, 360 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_SumPostDiv_uint8_tv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_uint8_tv 272 bytes stack frame, 296 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_SumPostDiv_uint8_tv 520 bytes stack frame, 388 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_SumPostDiv_uint8_tv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_uint8_tv 416 bytes stack frame, 472 bytes spill stores, 888 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_SumPostDiv_uint8_tv 512 bytes stack frame, 388 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_SumPostDiv_uint8_tv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_NVLS_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_uint8_tv 376 bytes stack frame, 484 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_LL128_SumPostDiv_uint8_tv 520 bytes stack frame, 392 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_RING_LL_SumPostDiv_uint8_tv 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_Reduce_TREE_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_int8_tv 232 bytes stack frame, 248 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_SumPostDiv_int8_tv 488 bytes stack frame, 360 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_SumPostDiv_int8_tv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_int8_tv 232 bytes stack frame, 248 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_SumPostDiv_int8_tv 488 bytes stack frame, 360 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_SumPostDiv_int8_tv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_int8_tv 232 bytes stack frame, 248 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_SumPostDiv_int8_tv 488 bytes stack frame, 360 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_SumPostDiv_int8_tv 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_int8_tv 272 bytes stack frame, 288 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_SumPostDiv_int8_tv 520 bytes stack frame, 388 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_SumPostDiv_int8_tv 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_int8_tv 416 bytes stack frame, 472 bytes spill stores, 888 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_SumPostDiv_int8_tv 512 bytes stack frame, 388 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_SumPostDiv_int8_tv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_TREE_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_Reduce_NVLS_TREE_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_Reduce_NVLS_TREE_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_NVLS_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_NVLS_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_NVLS_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_Reduce_COLLNET_CHAIN_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_COLLNET_CHAIN_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_Reduce_COLLNET_DIRECT_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_COLLNET_DIRECT_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_RING_SIMPLE_SumPostDiv_int8_tv 376 bytes stack frame, 484 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_RING_LL128_SumPostDiv_int8_tv 512 bytes stack frame, 384 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_RING_LL_SumPostDiv_int8_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_Reduce_TREE_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_Reduce_TREE_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_Reduce_TREE_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z60ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_RING_SIMPLE_PreMulSum___nv_bfloat16v 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_RING_LL128_PreMulSum___nv_bfloat16v 480 bytes stack frame, 368 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_RING_LL_PreMulSum___nv_bfloat16v 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_TREE_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_TREE_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_TREE_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z60ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_RING_SIMPLE_PreMulSum___nv_bfloat16v 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_RING_LL128_PreMulSum___nv_bfloat16v 480 bytes stack frame, 368 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_RING_LL_PreMulSum___nv_bfloat16v 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_TREE_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_TREE_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_TREE_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z60ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_RING_SIMPLE_PreMulSum___nv_bfloat16v 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_RING_LL128_PreMulSum___nv_bfloat16v 480 bytes stack frame, 368 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_RING_LL_PreMulSum___nv_bfloat16v 96 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_TREE_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_TREE_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_TREE_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z60ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_RING_SIMPLE_PreMulSum___nv_bfloat16v 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_RING_LL128_PreMulSum___nv_bfloat16v 512 bytes stack frame, 436 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_RING_LL_PreMulSum___nv_bfloat16v 120 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_TREE_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_TREE_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_TREE_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z60ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_RING_SIMPLE_PreMulSum___nv_bfloat16v 264 bytes stack frame, 276 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_RING_LL128_PreMulSum___nv_bfloat16v 512 bytes stack frame, 404 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_RING_LL_PreMulSum___nv_bfloat16v 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_TREE_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_TREE_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_TREE_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z60ncclFunction_Reduce_NVLS_TREE_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_Reduce_NVLS_TREE_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_Reduce_NVLS_TREE_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_NVLS_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_NVLS_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_NVLS_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_Reduce_COLLNET_CHAIN_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_Reduce_COLLNET_CHAIN_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_Reduce_COLLNET_CHAIN_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_Reduce_COLLNET_DIRECT_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_Reduce_COLLNET_DIRECT_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_Reduce_COLLNET_DIRECT_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_RING_SIMPLE_PreMulSum___nv_bfloat16v 232 bytes stack frame, 252 bytes spill stores, 236 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_RING_LL128_PreMulSum___nv_bfloat16v 504 bytes stack frame, 388 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_RING_LL_PreMulSum___nv_bfloat16v 112 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_Reduce_TREE_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_Reduce_TREE_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_Reduce_TREE_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 89 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_int32_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Sum_int32_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Sum_int32_tv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Sum_int32_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 88 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_int32_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Sum_int32_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Sum_int32_tv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Sum_int32_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 89 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_int32_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Sum_int32_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Sum_int32_tv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Sum_int32_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_int32_tv 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Sum_int32_tv 304 bytes stack frame, 332 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Sum_int32_tv 552 bytes stack frame, 420 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Sum_int32_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_int32_tv 336 bytes stack frame, 348 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Sum_int32_tv 408 bytes stack frame, 444 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Sum_int32_tv 560 bytes stack frame, 432 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Sum_int32_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_int32_tv 288 bytes stack frame, 300 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Sum_int32_tv 312 bytes stack frame, 340 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Sum_int32_tv 528 bytes stack frame, 400 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Sum_int32_tv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_int64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Sum_int64_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Sum_int64_tv 520 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Sum_int64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_int64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Sum_int64_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Sum_int64_tv 520 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Sum_int64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_int64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Sum_int64_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Sum_int64_tv 520 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Sum_int64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_int64_tv 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Sum_int64_tv 304 bytes stack frame, 332 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Sum_int64_tv 560 bytes stack frame, 436 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Sum_int64_tv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_int64_tv 336 bytes stack frame, 348 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Sum_int64_tv 400 bytes stack frame, 444 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Sum_int64_tv 576 bytes stack frame, 456 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Sum_int64_tv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 90 registers ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_int64_tv 288 bytes stack frame, 300 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Sum_int64_tv 312 bytes stack frame, 340 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Sum_int64_tv 536 bytes stack frame, 416 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Sum_int64_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z54ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z54ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z55ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z55ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_uint64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Sum_uint64_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Sum_uint64_tv 520 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Sum_uint64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z54ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z54ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z55ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z55ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_uint64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Sum_uint64_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Sum_uint64_tv 520 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Sum_uint64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z54ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z54ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z55ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z55ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 85 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_uint64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Sum_uint64_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Sum_uint64_tv 520 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Sum_uint64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z54ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z54ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z55ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z55ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_uint64_tv 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Sum_uint64_tv 304 bytes stack frame, 332 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Sum_uint64_tv 560 bytes stack frame, 436 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Sum_uint64_tv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z54ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z54ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z55ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z55ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_uint64_tv 336 bytes stack frame, 348 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Sum_uint64_tv 400 bytes stack frame, 444 bytes spill stores, 780 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Sum_uint64_tv 576 bytes stack frame, 456 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Sum_uint64_tv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z54ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z54ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z55ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z55ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 90 registers ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_uint64_tv 288 bytes stack frame, 300 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Sum_uint64_tv 312 bytes stack frame, 340 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Sum_uint64_tv 536 bytes stack frame, 416 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Sum_uint64_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z54ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z54ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z55ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z55ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 89 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_uint32_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Sum_uint32_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Sum_uint32_tv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Sum_uint32_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z54ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z54ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z55ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z55ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 88 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_uint32_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Sum_uint32_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Sum_uint32_tv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Sum_uint32_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z54ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z54ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z55ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z55ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 89 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_uint32_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Sum_uint32_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Sum_uint32_tv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Sum_uint32_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z54ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z54ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z55ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z55ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_uint32_tv 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Sum_uint32_tv 304 bytes stack frame, 332 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Sum_uint32_tv 552 bytes stack frame, 420 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Sum_uint32_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z54ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z54ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z55ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z55ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_uint32_tv 336 bytes stack frame, 348 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Sum_uint32_tv 408 bytes stack frame, 444 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Sum_uint32_tv 560 bytes stack frame, 432 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Sum_uint32_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z54ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z54ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z55ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z55ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers ptxas info : Compiling entry function '_Z45ncclKernel_ReduceScatter_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z45ncclKernel_ReduceScatter_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_uint32_tv 288 bytes stack frame, 300 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Sum_uint32_tv 312 bytes stack frame, 340 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Sum_uint32_tv 528 bytes stack frame, 400 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Sum_uint32_tv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_int32_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Prod_int32_tv 224 bytes stack frame, 244 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Prod_int32_tv 520 bytes stack frame, 380 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Prod_int32_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_int32_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Prod_int32_tv 224 bytes stack frame, 244 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Prod_int32_tv 520 bytes stack frame, 380 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Prod_int32_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_int32_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Prod_int32_tv 224 bytes stack frame, 244 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Prod_int32_tv 520 bytes stack frame, 380 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Prod_int32_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_int32_tv 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Prod_int32_tv 304 bytes stack frame, 332 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Prod_int32_tv 552 bytes stack frame, 420 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Prod_int32_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_int32_tv 336 bytes stack frame, 348 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Prod_int32_tv 408 bytes stack frame, 444 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Prod_int32_tv 560 bytes stack frame, 432 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Prod_int32_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_int32_tv 288 bytes stack frame, 300 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Prod_int32_tv 312 bytes stack frame, 340 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Prod_int32_tv 528 bytes stack frame, 400 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Prod_int32_tv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers, 344 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_uint8_tv 272 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Sum_uint8_tv 280 bytes stack frame, 308 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Sum_uint8_tv 496 bytes stack frame, 332 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Sum_uint8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 88 registers, 344 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_uint8_tv 272 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Sum_uint8_tv 280 bytes stack frame, 308 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Sum_uint8_tv 496 bytes stack frame, 332 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Sum_uint8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers, 344 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_uint8_tv 272 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Sum_uint8_tv 280 bytes stack frame, 308 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Sum_uint8_tv 496 bytes stack frame, 332 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Sum_uint8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 92 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_uint8_tv 288 bytes stack frame, 308 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Sum_uint8_tv 336 bytes stack frame, 380 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Sum_uint8_tv 552 bytes stack frame, 412 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Sum_uint8_tv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_uint8_tv 424 bytes stack frame, 492 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Sum_uint8_tv 472 bytes stack frame, 720 bytes spill stores, 1228 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Sum_uint8_tv 528 bytes stack frame, 392 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Sum_uint8_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z49ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z53ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z54ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 91 registers ptxas info : Compiling entry function '_Z44ncclKernel_ReduceScatter_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z44ncclKernel_ReduceScatter_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_uint8_tv 416 bytes stack frame, 556 bytes spill stores, 956 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Sum_uint8_tv 400 bytes stack frame, 520 bytes spill stores, 816 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Sum_uint8_tv 504 bytes stack frame, 360 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Sum_uint8_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_uint32_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_SIMPLE_Prod_uint32_tv 224 bytes stack frame, 244 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_LL128_Prod_uint32_tv 520 bytes stack frame, 380 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL_Prod_uint32_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_uint32_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_SIMPLE_Prod_uint32_tv 224 bytes stack frame, 244 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_LL128_Prod_uint32_tv 520 bytes stack frame, 380 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL_Prod_uint32_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_uint32_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_SIMPLE_Prod_uint32_tv 224 bytes stack frame, 244 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_LL128_Prod_uint32_tv 520 bytes stack frame, 380 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL_Prod_uint32_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_uint32_tv 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_SIMPLE_Prod_uint32_tv 304 bytes stack frame, 332 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_LL128_Prod_uint32_tv 552 bytes stack frame, 420 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL_Prod_uint32_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_uint32_tv 336 bytes stack frame, 348 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_SIMPLE_Prod_uint32_tv 408 bytes stack frame, 444 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_LL128_Prod_uint32_tv 560 bytes stack frame, 432 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL_Prod_uint32_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_uint32_tv 288 bytes stack frame, 300 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_SIMPLE_Prod_uint32_tv 312 bytes stack frame, 340 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_LL128_Prod_uint32_tv 528 bytes stack frame, 400 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL_Prod_uint32_tv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_int64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Prod_int64_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Prod_int64_tv 520 bytes stack frame, 396 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Prod_int64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_int64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Prod_int64_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Prod_int64_tv 520 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Prod_int64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_int64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Prod_int64_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Prod_int64_tv 520 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Prod_int64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_int64_tv 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Prod_int64_tv 296 bytes stack frame, 324 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Prod_int64_tv 560 bytes stack frame, 436 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Prod_int64_tv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_int64_tv 336 bytes stack frame, 348 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Prod_int64_tv 416 bytes stack frame, 484 bytes spill stores, 808 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Prod_int64_tv 552 bytes stack frame, 412 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Prod_int64_tv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_int64_tv 288 bytes stack frame, 300 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Prod_int64_tv 312 bytes stack frame, 352 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Prod_int64_tv 512 bytes stack frame, 376 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Prod_int64_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_uint64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_SIMPLE_Prod_uint64_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_LL128_Prod_uint64_tv 520 bytes stack frame, 396 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL_Prod_uint64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_uint64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_SIMPLE_Prod_uint64_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_LL128_Prod_uint64_tv 520 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL_Prod_uint64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_uint64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_SIMPLE_Prod_uint64_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_LL128_Prod_uint64_tv 520 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL_Prod_uint64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_uint64_tv 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_SIMPLE_Prod_uint64_tv 296 bytes stack frame, 324 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_LL128_Prod_uint64_tv 560 bytes stack frame, 436 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL_Prod_uint64_tv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_uint64_tv 336 bytes stack frame, 348 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_SIMPLE_Prod_uint64_tv 416 bytes stack frame, 484 bytes spill stores, 808 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_LL128_Prod_uint64_tv 552 bytes stack frame, 412 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL_Prod_uint64_tv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_uint64_tv 288 bytes stack frame, 300 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_SIMPLE_Prod_uint64_tv 312 bytes stack frame, 352 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_LL128_Prod_uint64_tv 512 bytes stack frame, 376 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL_Prod_uint64_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z47ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z47ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z42ncclKernel_ReduceScatter_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z42ncclKernel_ReduceScatter_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z51ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z51ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z52ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z52ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z42ncclKernel_ReduceScatter_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z42ncclKernel_ReduceScatter_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 89 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z42ncclKernel_ReduceScatter_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z42ncclKernel_ReduceScatter_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_floatv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_NVLS_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_SIMPLE_Sum_floatv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL128_Sum_floatv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_RING_LL_Sum_floatv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_TREE_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z47ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z47ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z42ncclKernel_ReduceScatter_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z42ncclKernel_ReduceScatter_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z51ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z51ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z52ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z52ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z42ncclKernel_ReduceScatter_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z42ncclKernel_ReduceScatter_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 88 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z42ncclKernel_ReduceScatter_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z42ncclKernel_ReduceScatter_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_floatv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_NVLS_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_SIMPLE_Sum_floatv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL128_Sum_floatv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_RING_LL_Sum_floatv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_TREE_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z47ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z47ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z42ncclKernel_ReduceScatter_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z42ncclKernel_ReduceScatter_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z51ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z51ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z52ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z52ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z42ncclKernel_ReduceScatter_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z42ncclKernel_ReduceScatter_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 89 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z42ncclKernel_ReduceScatter_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z42ncclKernel_ReduceScatter_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_floatv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_NVLS_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_SIMPLE_Sum_floatv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL128_Sum_floatv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_RING_LL_Sum_floatv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_TREE_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z47ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z47ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z42ncclKernel_ReduceScatter_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z42ncclKernel_ReduceScatter_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z51ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z51ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z52ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z52ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z42ncclKernel_ReduceScatter_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z42ncclKernel_ReduceScatter_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z42ncclKernel_ReduceScatter_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z42ncclKernel_ReduceScatter_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_floatv 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_NVLS_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_SIMPLE_Sum_floatv 304 bytes stack frame, 332 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL128_Sum_floatv 552 bytes stack frame, 420 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_RING_LL_Sum_floatv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_TREE_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z47ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z47ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z42ncclKernel_ReduceScatter_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z42ncclKernel_ReduceScatter_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z51ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z51ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z52ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z52ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z42ncclKernel_ReduceScatter_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z42ncclKernel_ReduceScatter_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z42ncclKernel_ReduceScatter_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z42ncclKernel_ReduceScatter_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_floatv 336 bytes stack frame, 348 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_NVLS_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_SIMPLE_Sum_floatv 408 bytes stack frame, 444 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL128_Sum_floatv 560 bytes stack frame, 432 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_RING_LL_Sum_floatv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_TREE_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z47ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z47ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z42ncclKernel_ReduceScatter_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z42ncclKernel_ReduceScatter_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z51ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z51ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z52ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z52ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z42ncclKernel_ReduceScatter_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z42ncclKernel_ReduceScatter_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers ptxas info : Compiling entry function '_Z42ncclKernel_ReduceScatter_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z42ncclKernel_ReduceScatter_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_floatv 288 bytes stack frame, 300 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_NVLS_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_SIMPLE_Sum_floatv 312 bytes stack frame, 340 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL128_Sum_floatv 528 bytes stack frame, 400 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_RING_LL_Sum_floatv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_TREE_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z48ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z48ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z52ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z52ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z53ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z53ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers, 344 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_int8_tv 272 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Sum_int8_tv 280 bytes stack frame, 308 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Sum_int8_tv 496 bytes stack frame, 332 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Sum_int8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z48ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z48ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z52ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z52ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z53ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z53ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 88 registers, 344 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_int8_tv 272 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Sum_int8_tv 280 bytes stack frame, 308 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Sum_int8_tv 496 bytes stack frame, 332 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Sum_int8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z48ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z48ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z52ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z52ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z53ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z53ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers, 344 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_int8_tv 272 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Sum_int8_tv 280 bytes stack frame, 308 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Sum_int8_tv 496 bytes stack frame, 332 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Sum_int8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z48ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z48ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z52ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z52ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z53ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z53ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 92 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_int8_tv 288 bytes stack frame, 308 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Sum_int8_tv 336 bytes stack frame, 380 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Sum_int8_tv 552 bytes stack frame, 412 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Sum_int8_tv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z48ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z48ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z52ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z52ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z53ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z53ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_int8_tv 424 bytes stack frame, 492 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Sum_int8_tv 472 bytes stack frame, 720 bytes spill stores, 1228 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Sum_int8_tv 528 bytes stack frame, 392 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Sum_int8_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z48ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z48ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z52ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z52ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z53ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z53ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 91 registers ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_int8_tv 416 bytes stack frame, 556 bytes spill stores, 956 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Sum_int8_tv 400 bytes stack frame, 520 bytes spill stores, 816 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Sum_int8_tv 504 bytes stack frame, 360 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Sum_int8_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z48ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z48ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z52ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z52ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z53ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z53ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_doublev 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Sum_doublev 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Sum_doublev 520 bytes stack frame, 380 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Sum_doublev 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z48ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z48ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z52ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z52ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z53ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z53ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_doublev 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Sum_doublev 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Sum_doublev 520 bytes stack frame, 380 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Sum_doublev 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z48ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z48ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z52ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z52ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z53ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z53ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_doublev 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Sum_doublev 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Sum_doublev 520 bytes stack frame, 380 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Sum_doublev 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z48ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z48ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z52ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z52ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z53ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z53ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_doublev 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Sum_doublev 288 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Sum_doublev 560 bytes stack frame, 436 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Sum_doublev 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z48ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z48ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z52ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z52ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z53ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z53ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_doublev 336 bytes stack frame, 348 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Sum_doublev 352 bytes stack frame, 384 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Sum_doublev 560 bytes stack frame, 428 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Sum_doublev 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z48ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z48ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z52ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z52ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z53ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z53ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 91 registers ptxas info : Compiling entry function '_Z43ncclKernel_ReduceScatter_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z43ncclKernel_ReduceScatter_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_doublev 288 bytes stack frame, 300 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Sum_doublev 264 bytes stack frame, 284 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Sum_doublev 520 bytes stack frame, 384 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Sum_doublev 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z46ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z46ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_ReduceScatter_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z41ncclKernel_ReduceScatter_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z51ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z51ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_ReduceScatter_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z41ncclKernel_ReduceScatter_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 88 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z41ncclKernel_ReduceScatter_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z41ncclKernel_ReduceScatter_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_halfv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_NVLS_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_SIMPLE_Sum_halfv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL128_Sum_halfv 520 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_RING_LL_Sum_halfv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_TREE_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z46ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z46ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_ReduceScatter_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z41ncclKernel_ReduceScatter_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z51ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z51ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_ReduceScatter_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z41ncclKernel_ReduceScatter_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 88 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z41ncclKernel_ReduceScatter_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z41ncclKernel_ReduceScatter_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_halfv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_NVLS_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_SIMPLE_Sum_halfv 208 bytes stack frame, 224 bytes spill stores, 216 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL128_Sum_halfv 528 bytes stack frame, 396 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_RING_LL_Sum_halfv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_TREE_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z46ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z46ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_ReduceScatter_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z41ncclKernel_ReduceScatter_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z51ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z51ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_ReduceScatter_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z41ncclKernel_ReduceScatter_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 88 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z41ncclKernel_ReduceScatter_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z41ncclKernel_ReduceScatter_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_halfv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_NVLS_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_SIMPLE_Sum_halfv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL128_Sum_halfv 520 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_RING_LL_Sum_halfv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_TREE_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z46ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z46ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_ReduceScatter_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z41ncclKernel_ReduceScatter_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z51ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z51ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_ReduceScatter_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z41ncclKernel_ReduceScatter_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_ReduceScatter_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z41ncclKernel_ReduceScatter_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_halfv 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_NVLS_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_SIMPLE_Sum_halfv 288 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL128_Sum_halfv 576 bytes stack frame, 468 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_RING_LL_Sum_halfv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_TREE_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z46ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z46ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_ReduceScatter_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z41ncclKernel_ReduceScatter_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z51ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z51ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_ReduceScatter_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z41ncclKernel_ReduceScatter_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 95 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_ReduceScatter_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z41ncclKernel_ReduceScatter_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_halfv 336 bytes stack frame, 348 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_NVLS_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_SIMPLE_Sum_halfv 384 bytes stack frame, 440 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL128_Sum_halfv 568 bytes stack frame, 432 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_RING_LL_Sum_halfv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_TREE_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z46ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z46ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z41ncclKernel_ReduceScatter_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z41ncclKernel_ReduceScatter_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z51ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z51ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z41ncclKernel_ReduceScatter_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z41ncclKernel_ReduceScatter_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 90 registers ptxas info : Compiling entry function '_Z41ncclKernel_ReduceScatter_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z41ncclKernel_ReduceScatter_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum_halfv 272 bytes stack frame, 288 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_NVLS_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_SIMPLE_Sum_halfv 280 bytes stack frame, 304 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL128_Sum_halfv 536 bytes stack frame, 400 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_RING_LL_Sum_halfv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_TREE_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_int8_tv 272 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Prod_int8_tv 280 bytes stack frame, 308 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Prod_int8_tv 496 bytes stack frame, 332 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Prod_int8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_int8_tv 272 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Prod_int8_tv 280 bytes stack frame, 308 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Prod_int8_tv 496 bytes stack frame, 332 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Prod_int8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_int8_tv 272 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Prod_int8_tv 280 bytes stack frame, 308 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Prod_int8_tv 496 bytes stack frame, 332 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Prod_int8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_int8_tv 288 bytes stack frame, 308 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Prod_int8_tv 336 bytes stack frame, 380 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Prod_int8_tv 568 bytes stack frame, 460 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Prod_int8_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_int8_tv 424 bytes stack frame, 492 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Prod_int8_tv 480 bytes stack frame, 668 bytes spill stores, 1248 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Prod_int8_tv 568 bytes stack frame, 440 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Prod_int8_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_int8_tv 416 bytes stack frame, 556 bytes spill stores, 956 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Prod_int8_tv 408 bytes stack frame, 524 bytes spill stores, 820 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Prod_int8_tv 536 bytes stack frame, 392 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Prod_int8_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_uint8_tv 272 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Prod_uint8_tv 280 bytes stack frame, 308 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Prod_uint8_tv 496 bytes stack frame, 332 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Prod_uint8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_uint8_tv 272 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Prod_uint8_tv 280 bytes stack frame, 308 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Prod_uint8_tv 496 bytes stack frame, 332 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Prod_uint8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_uint8_tv 272 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Prod_uint8_tv 280 bytes stack frame, 308 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Prod_uint8_tv 496 bytes stack frame, 332 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Prod_uint8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_uint8_tv 288 bytes stack frame, 308 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Prod_uint8_tv 336 bytes stack frame, 380 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Prod_uint8_tv 568 bytes stack frame, 460 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Prod_uint8_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_uint8_tv 424 bytes stack frame, 492 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Prod_uint8_tv 480 bytes stack frame, 668 bytes spill stores, 1248 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Prod_uint8_tv 568 bytes stack frame, 440 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Prod_uint8_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_uint8_tv 416 bytes stack frame, 556 bytes spill stores, 956 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Prod_uint8_tv 408 bytes stack frame, 524 bytes spill stores, 820 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Prod_uint8_tv 536 bytes stack frame, 392 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Prod_uint8_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_halfv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_NVLS_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_SIMPLE_Prod_halfv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL128_Prod_halfv 520 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_RING_LL_Prod_halfv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_TREE_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_halfv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_NVLS_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_SIMPLE_Prod_halfv 208 bytes stack frame, 224 bytes spill stores, 216 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL128_Prod_halfv 528 bytes stack frame, 396 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_RING_LL_Prod_halfv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_TREE_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_halfv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_NVLS_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_SIMPLE_Prod_halfv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL128_Prod_halfv 520 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_RING_LL_Prod_halfv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_TREE_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_halfv 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_NVLS_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_SIMPLE_Prod_halfv 288 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL128_Prod_halfv 576 bytes stack frame, 468 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_RING_LL_Prod_halfv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_TREE_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_halfv 336 bytes stack frame, 348 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_NVLS_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_SIMPLE_Prod_halfv 384 bytes stack frame, 440 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL128_Prod_halfv 568 bytes stack frame, 432 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_RING_LL_Prod_halfv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_TREE_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_halfv 272 bytes stack frame, 288 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_NVLS_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_SIMPLE_Prod_halfv 280 bytes stack frame, 304 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL128_Prod_halfv 536 bytes stack frame, 400 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_RING_LL_Prod_halfv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_TREE_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z55ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z55ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z59ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z59ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z60ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z60ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum___nv_bfloat16v 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_Sum___nv_bfloat16v 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_Sum___nv_bfloat16v 520 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_Sum___nv_bfloat16v 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z55ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z55ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z59ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z59ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z60ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z60ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 88 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum___nv_bfloat16v 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_Sum___nv_bfloat16v 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_Sum___nv_bfloat16v 520 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_Sum___nv_bfloat16v 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z55ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z55ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z59ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z59ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z60ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z60ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 87 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum___nv_bfloat16v 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_Sum___nv_bfloat16v 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_Sum___nv_bfloat16v 520 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_Sum___nv_bfloat16v 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z55ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z55ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z59ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z59ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z60ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z60ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 93 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum___nv_bfloat16v 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_Sum___nv_bfloat16v 264 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_Sum___nv_bfloat16v 560 bytes stack frame, 428 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_Sum___nv_bfloat16v 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z55ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z55ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z59ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z59ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z60ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z60ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 95 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum___nv_bfloat16v 336 bytes stack frame, 348 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_Sum___nv_bfloat16v 384 bytes stack frame, 440 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_Sum___nv_bfloat16v 568 bytes stack frame, 432 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_Sum___nv_bfloat16v 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z55ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z55ncclKernel_ReduceScatter_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z59ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z59ncclKernel_ReduceScatter_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z60ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z60ncclKernel_ReduceScatter_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 90 registers ptxas info : Compiling entry function '_Z50ncclKernel_ReduceScatter_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z50ncclKernel_ReduceScatter_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_Sum___nv_bfloat16v 272 bytes stack frame, 288 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_Sum___nv_bfloat16v 280 bytes stack frame, 304 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_Sum___nv_bfloat16v 536 bytes stack frame, 400 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_Sum___nv_bfloat16v 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_doublev 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_doublev 528 bytes stack frame, 1040 bytes spill stores, 1456 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Min_doublev 248 bytes stack frame, 272 bytes spill stores, 284 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Min_doublev 496 bytes stack frame, 348 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Min_doublev 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Min_doublev 328 bytes stack frame, 436 bytes spill stores, 668 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Min_doublev 504 bytes stack frame, 348 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Min_doublev 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_doublev 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_doublev 520 bytes stack frame, 1024 bytes spill stores, 1416 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Min_doublev 232 bytes stack frame, 252 bytes spill stores, 252 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Min_doublev 496 bytes stack frame, 348 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Min_doublev 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Min_doublev 320 bytes stack frame, 396 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Min_doublev 496 bytes stack frame, 336 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Min_doublev 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_doublev 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_doublev 520 bytes stack frame, 1024 bytes spill stores, 1416 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Min_doublev 232 bytes stack frame, 252 bytes spill stores, 252 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Min_doublev 496 bytes stack frame, 348 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Min_doublev 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Min_doublev 320 bytes stack frame, 396 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Min_doublev 496 bytes stack frame, 336 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Min_doublev 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_doublev 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_doublev 536 bytes stack frame, 1056 bytes spill stores, 1484 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Min_doublev 288 bytes stack frame, 312 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Min_doublev 568 bytes stack frame, 444 bytes spill stores, 596 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Min_doublev 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Min_doublev 320 bytes stack frame, 444 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Min_doublev 568 bytes stack frame, 408 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Min_doublev 360 bytes stack frame, 560 bytes spill stores, 764 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_doublev 296 bytes stack frame, 324 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_doublev 672 bytes stack frame, 1436 bytes spill stores, 2212 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Min_doublev 408 bytes stack frame, 504 bytes spill stores, 1228 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Min_doublev 584 bytes stack frame, 492 bytes spill stores, 640 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Min_doublev 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Min_doublev 472 bytes stack frame, 948 bytes spill stores, 1544 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Min_doublev 568 bytes stack frame, 408 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Min_doublev 368 bytes stack frame, 580 bytes spill stores, 788 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_doublev 240 bytes stack frame, 268 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_doublev 664 bytes stack frame, 1392 bytes spill stores, 2188 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Min_doublev 304 bytes stack frame, 344 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Min_doublev 520 bytes stack frame, 400 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Min_doublev 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Min_doublev 440 bytes stack frame, 1000 bytes spill stores, 1468 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Min_doublev 488 bytes stack frame, 304 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Min_doublev 352 bytes stack frame, 528 bytes spill stores, 788 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_doublev 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_doublev 520 bytes stack frame, 1024 bytes spill stores, 1416 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Prod_doublev 232 bytes stack frame, 252 bytes spill stores, 252 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Prod_doublev 496 bytes stack frame, 348 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Prod_doublev 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Prod_doublev 320 bytes stack frame, 396 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Prod_doublev 496 bytes stack frame, 336 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Prod_doublev 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_doublev 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_doublev 520 bytes stack frame, 1024 bytes spill stores, 1416 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Prod_doublev 232 bytes stack frame, 252 bytes spill stores, 252 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Prod_doublev 496 bytes stack frame, 348 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Prod_doublev 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Prod_doublev 320 bytes stack frame, 396 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Prod_doublev 496 bytes stack frame, 336 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Prod_doublev 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_doublev 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_doublev 520 bytes stack frame, 1024 bytes spill stores, 1416 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Prod_doublev 232 bytes stack frame, 252 bytes spill stores, 252 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Prod_doublev 496 bytes stack frame, 348 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Prod_doublev 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Prod_doublev 320 bytes stack frame, 396 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Prod_doublev 496 bytes stack frame, 336 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Prod_doublev 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_doublev 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_doublev 552 bytes stack frame, 1096 bytes spill stores, 1632 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Prod_doublev 288 bytes stack frame, 312 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Prod_doublev 560 bytes stack frame, 436 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Prod_doublev 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Prod_doublev 344 bytes stack frame, 484 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Prod_doublev 568 bytes stack frame, 408 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Prod_doublev 360 bytes stack frame, 560 bytes spill stores, 764 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_doublev 240 bytes stack frame, 260 bytes spill stores, 252 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_doublev 696 bytes stack frame, 1556 bytes spill stores, 2396 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Prod_doublev 360 bytes stack frame, 396 bytes spill stores, 788 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Prod_doublev 544 bytes stack frame, 396 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Prod_doublev 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Prod_doublev 472 bytes stack frame, 960 bytes spill stores, 1496 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Prod_doublev 568 bytes stack frame, 408 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Prod_doublev 368 bytes stack frame, 580 bytes spill stores, 788 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_doublev 224 bytes stack frame, 244 bytes spill stores, 252 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_doublev 688 bytes stack frame, 1540 bytes spill stores, 2392 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Prod_doublev 280 bytes stack frame, 296 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Prod_doublev 488 bytes stack frame, 344 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Prod_doublev 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Prod_doublev 432 bytes stack frame, 864 bytes spill stores, 1212 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Prod_doublev 480 bytes stack frame, 296 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Prod_doublev 352 bytes stack frame, 528 bytes spill stores, 788 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_doublev 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_doublev 528 bytes stack frame, 1064 bytes spill stores, 1496 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_doublev 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_RING_LL128_PreMulSum_doublev 488 bytes stack frame, 336 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_LL_PreMulSum_doublev 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_doublev 320 bytes stack frame, 396 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_TREE_LL128_PreMulSum_doublev 504 bytes stack frame, 340 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_LL_PreMulSum_doublev 336 bytes stack frame, 568 bytes spill stores, 704 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_doublev 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_doublev 528 bytes stack frame, 1064 bytes spill stores, 1496 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_doublev 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_RING_LL128_PreMulSum_doublev 480 bytes stack frame, 316 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_LL_PreMulSum_doublev 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_doublev 320 bytes stack frame, 396 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_TREE_LL128_PreMulSum_doublev 504 bytes stack frame, 340 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_LL_PreMulSum_doublev 336 bytes stack frame, 568 bytes spill stores, 704 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_doublev 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_doublev 528 bytes stack frame, 1064 bytes spill stores, 1496 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_doublev 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_RING_LL128_PreMulSum_doublev 480 bytes stack frame, 316 bytes spill stores, 372 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_LL_PreMulSum_doublev 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_doublev 320 bytes stack frame, 396 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_TREE_LL128_PreMulSum_doublev 504 bytes stack frame, 340 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_LL_PreMulSum_doublev 336 bytes stack frame, 568 bytes spill stores, 704 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_doublev 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_doublev 568 bytes stack frame, 1140 bytes spill stores, 1820 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_doublev 288 bytes stack frame, 316 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_RING_LL128_PreMulSum_doublev 552 bytes stack frame, 400 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_LL_PreMulSum_doublev 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_doublev 344 bytes stack frame, 484 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_TREE_LL128_PreMulSum_doublev 576 bytes stack frame, 412 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_LL_PreMulSum_doublev 368 bytes stack frame, 564 bytes spill stores, 784 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_doublev 240 bytes stack frame, 260 bytes spill stores, 252 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_doublev 696 bytes stack frame, 1532 bytes spill stores, 2344 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_doublev 368 bytes stack frame, 440 bytes spill stores, 820 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_RING_LL128_PreMulSum_doublev 560 bytes stack frame, 404 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_LL_PreMulSum_doublev 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_doublev 456 bytes stack frame, 932 bytes spill stores, 1460 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_TREE_LL128_PreMulSum_doublev 576 bytes stack frame, 412 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_LL_PreMulSum_doublev 376 bytes stack frame, 588 bytes spill stores, 808 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_doublev 224 bytes stack frame, 244 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_doublev 696 bytes stack frame, 1508 bytes spill stores, 2396 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_doublev 288 bytes stack frame, 320 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_RING_LL128_PreMulSum_doublev 488 bytes stack frame, 336 bytes spill stores, 464 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_LL_PreMulSum_doublev 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_doublev 440 bytes stack frame, 904 bytes spill stores, 1268 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_TREE_LL128_PreMulSum_doublev 488 bytes stack frame, 296 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_LL_PreMulSum_doublev 360 bytes stack frame, 532 bytes spill stores, 808 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_doublev 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Prod_doublev 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Prod_doublev 520 bytes stack frame, 380 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Prod_doublev 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_doublev 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Prod_doublev 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Prod_doublev 520 bytes stack frame, 380 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Prod_doublev 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_doublev 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Prod_doublev 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Prod_doublev 520 bytes stack frame, 380 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Prod_doublev 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_doublev 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Prod_doublev 288 bytes stack frame, 316 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Prod_doublev 560 bytes stack frame, 436 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Prod_doublev 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_doublev 336 bytes stack frame, 348 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Prod_doublev 352 bytes stack frame, 384 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Prod_doublev 560 bytes stack frame, 428 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Prod_doublev 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_doublev 288 bytes stack frame, 300 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Prod_doublev 264 bytes stack frame, 284 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Prod_doublev 520 bytes stack frame, 384 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Prod_doublev 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Prod_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_floatv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Prod_floatv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Prod_floatv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Prod_floatv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_floatv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Prod_floatv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Prod_floatv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Prod_floatv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_floatv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Prod_floatv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Prod_floatv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Prod_floatv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_floatv 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Prod_floatv 304 bytes stack frame, 332 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Prod_floatv 552 bytes stack frame, 420 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Prod_floatv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_floatv 336 bytes stack frame, 348 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Prod_floatv 408 bytes stack frame, 444 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Prod_floatv 560 bytes stack frame, 432 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Prod_floatv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod_floatv 288 bytes stack frame, 300 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Prod_floatv 312 bytes stack frame, 340 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Prod_floatv 528 bytes stack frame, 400 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Prod_floatv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_int64_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_int64_tv 496 bytes stack frame, 988 bytes spill stores, 1328 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Prod_int64_tv 248 bytes stack frame, 272 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Prod_int64_tv 520 bytes stack frame, 384 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Prod_int64_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Prod_int64_tv 288 bytes stack frame, 360 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Prod_int64_tv 504 bytes stack frame, 348 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Prod_int64_tv 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_int64_tv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_int64_tv 496 bytes stack frame, 988 bytes spill stores, 1328 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Prod_int64_tv 248 bytes stack frame, 272 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Prod_int64_tv 520 bytes stack frame, 384 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Prod_int64_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Prod_int64_tv 288 bytes stack frame, 360 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Prod_int64_tv 504 bytes stack frame, 348 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Prod_int64_tv 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_int64_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_int64_tv 496 bytes stack frame, 988 bytes spill stores, 1328 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Prod_int64_tv 248 bytes stack frame, 272 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Prod_int64_tv 520 bytes stack frame, 384 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Prod_int64_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Prod_int64_tv 288 bytes stack frame, 360 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Prod_int64_tv 504 bytes stack frame, 348 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Prod_int64_tv 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_int64_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_int64_tv 600 bytes stack frame, 1296 bytes spill stores, 2052 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Prod_int64_tv 304 bytes stack frame, 328 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Prod_int64_tv 560 bytes stack frame, 436 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Prod_int64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Prod_int64_tv 312 bytes stack frame, 428 bytes spill stores, 580 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Prod_int64_tv 544 bytes stack frame, 380 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Prod_int64_tv 360 bytes stack frame, 560 bytes spill stores, 764 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_int64_tv 296 bytes stack frame, 324 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_int64_tv 680 bytes stack frame, 1508 bytes spill stores, 2224 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Prod_int64_tv 400 bytes stack frame, 464 bytes spill stores, 1116 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Prod_int64_tv 552 bytes stack frame, 408 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Prod_int64_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Prod_int64_tv 472 bytes stack frame, 988 bytes spill stores, 1424 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Prod_int64_tv 544 bytes stack frame, 380 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Prod_int64_tv 368 bytes stack frame, 572 bytes spill stores, 788 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_int64_tv 240 bytes stack frame, 268 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_int64_tv 664 bytes stack frame, 1476 bytes spill stores, 2196 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Prod_int64_tv 304 bytes stack frame, 360 bytes spill stores, 668 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Prod_int64_tv 496 bytes stack frame, 352 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Prod_int64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Prod_int64_tv 416 bytes stack frame, 960 bytes spill stores, 1248 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Prod_int64_tv 504 bytes stack frame, 324 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Prod_int64_tv 352 bytes stack frame, 528 bytes spill stores, 788 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_doublev 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_doublev 528 bytes stack frame, 1040 bytes spill stores, 1456 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Max_doublev 248 bytes stack frame, 272 bytes spill stores, 284 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Max_doublev 496 bytes stack frame, 348 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Max_doublev 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Max_doublev 328 bytes stack frame, 436 bytes spill stores, 668 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Max_doublev 504 bytes stack frame, 348 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Max_doublev 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_doublev 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_doublev 520 bytes stack frame, 1024 bytes spill stores, 1416 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Max_doublev 232 bytes stack frame, 252 bytes spill stores, 252 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Max_doublev 496 bytes stack frame, 348 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Max_doublev 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Max_doublev 320 bytes stack frame, 396 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Max_doublev 496 bytes stack frame, 336 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Max_doublev 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_doublev 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_doublev 520 bytes stack frame, 1024 bytes spill stores, 1416 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Max_doublev 232 bytes stack frame, 252 bytes spill stores, 252 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Max_doublev 496 bytes stack frame, 348 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Max_doublev 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Max_doublev 320 bytes stack frame, 396 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Max_doublev 496 bytes stack frame, 336 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Max_doublev 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_doublev 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_doublev 536 bytes stack frame, 1056 bytes spill stores, 1484 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Max_doublev 288 bytes stack frame, 312 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Max_doublev 568 bytes stack frame, 444 bytes spill stores, 596 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Max_doublev 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Max_doublev 320 bytes stack frame, 444 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Max_doublev 568 bytes stack frame, 408 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Max_doublev 360 bytes stack frame, 560 bytes spill stores, 764 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_doublev 296 bytes stack frame, 324 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_doublev 672 bytes stack frame, 1436 bytes spill stores, 2212 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Max_doublev 408 bytes stack frame, 504 bytes spill stores, 1228 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Max_doublev 584 bytes stack frame, 492 bytes spill stores, 640 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Max_doublev 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Max_doublev 472 bytes stack frame, 948 bytes spill stores, 1544 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Max_doublev 568 bytes stack frame, 408 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Max_doublev 368 bytes stack frame, 580 bytes spill stores, 788 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_doublev 240 bytes stack frame, 268 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_doublev 664 bytes stack frame, 1392 bytes spill stores, 2188 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Max_doublev 304 bytes stack frame, 344 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Max_doublev 520 bytes stack frame, 400 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Max_doublev 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Max_doublev 440 bytes stack frame, 1000 bytes spill stores, 1468 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Max_doublev 488 bytes stack frame, 304 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Max_doublev 352 bytes stack frame, 528 bytes spill stores, 788 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_floatv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_floatv 496 bytes stack frame, 1004 bytes spill stores, 1320 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Prod_floatv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Prod_floatv 496 bytes stack frame, 360 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Prod_floatv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Prod_floatv 336 bytes stack frame, 488 bytes spill stores, 688 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Prod_floatv 504 bytes stack frame, 348 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Prod_floatv 344 bytes stack frame, 592 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_floatv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_floatv 496 bytes stack frame, 1004 bytes spill stores, 1320 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Prod_floatv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Prod_floatv 496 bytes stack frame, 360 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Prod_floatv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Prod_floatv 336 bytes stack frame, 488 bytes spill stores, 688 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Prod_floatv 504 bytes stack frame, 348 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Prod_floatv 344 bytes stack frame, 592 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_floatv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_floatv 496 bytes stack frame, 1004 bytes spill stores, 1320 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Prod_floatv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Prod_floatv 496 bytes stack frame, 360 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Prod_floatv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Prod_floatv 336 bytes stack frame, 488 bytes spill stores, 688 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Prod_floatv 504 bytes stack frame, 348 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Prod_floatv 344 bytes stack frame, 592 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_floatv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_floatv 528 bytes stack frame, 1048 bytes spill stores, 1412 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Prod_floatv 288 bytes stack frame, 316 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Prod_floatv 552 bytes stack frame, 412 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Prod_floatv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Prod_floatv 344 bytes stack frame, 496 bytes spill stores, 668 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Prod_floatv 552 bytes stack frame, 400 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Prod_floatv 368 bytes stack frame, 592 bytes spill stores, 832 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_floatv 288 bytes stack frame, 312 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_floatv 568 bytes stack frame, 1204 bytes spill stores, 1812 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Prod_floatv 400 bytes stack frame, 468 bytes spill stores, 1164 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Prod_floatv 560 bytes stack frame, 432 bytes spill stores, 532 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Prod_floatv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Prod_floatv 480 bytes stack frame, 948 bytes spill stores, 1412 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Prod_floatv 560 bytes stack frame, 400 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Prod_floatv 376 bytes stack frame, 604 bytes spill stores, 856 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_floatv 240 bytes stack frame, 268 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_floatv 560 bytes stack frame, 1248 bytes spill stores, 1988 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Prod_floatv 304 bytes stack frame, 352 bytes spill stores, 640 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Prod_floatv 504 bytes stack frame, 356 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Prod_floatv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Prod_floatv 416 bytes stack frame, 828 bytes spill stores, 1196 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Prod_floatv 488 bytes stack frame, 308 bytes spill stores, 276 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Prod_floatv 360 bytes stack frame, 572 bytes spill stores, 820 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_uint64_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_uint64_tv 496 bytes stack frame, 988 bytes spill stores, 1328 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_SIMPLE_Prod_uint64_tv 248 bytes stack frame, 272 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_LL128_Prod_uint64_tv 520 bytes stack frame, 384 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL_Prod_uint64_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_SIMPLE_Prod_uint64_tv 288 bytes stack frame, 360 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_LL128_Prod_uint64_tv 504 bytes stack frame, 348 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL_Prod_uint64_tv 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_uint64_tv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_uint64_tv 496 bytes stack frame, 988 bytes spill stores, 1328 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_SIMPLE_Prod_uint64_tv 248 bytes stack frame, 272 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_LL128_Prod_uint64_tv 520 bytes stack frame, 384 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL_Prod_uint64_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_SIMPLE_Prod_uint64_tv 288 bytes stack frame, 360 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_LL128_Prod_uint64_tv 504 bytes stack frame, 348 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL_Prod_uint64_tv 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_uint64_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_uint64_tv 496 bytes stack frame, 988 bytes spill stores, 1328 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_SIMPLE_Prod_uint64_tv 248 bytes stack frame, 272 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_LL128_Prod_uint64_tv 520 bytes stack frame, 384 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL_Prod_uint64_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_SIMPLE_Prod_uint64_tv 288 bytes stack frame, 360 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_LL128_Prod_uint64_tv 504 bytes stack frame, 348 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL_Prod_uint64_tv 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_uint64_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_uint64_tv 600 bytes stack frame, 1296 bytes spill stores, 2052 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_SIMPLE_Prod_uint64_tv 304 bytes stack frame, 328 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_LL128_Prod_uint64_tv 560 bytes stack frame, 436 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL_Prod_uint64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_SIMPLE_Prod_uint64_tv 312 bytes stack frame, 428 bytes spill stores, 580 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_LL128_Prod_uint64_tv 544 bytes stack frame, 380 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL_Prod_uint64_tv 360 bytes stack frame, 560 bytes spill stores, 764 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_uint64_tv 296 bytes stack frame, 324 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_uint64_tv 680 bytes stack frame, 1508 bytes spill stores, 2224 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_SIMPLE_Prod_uint64_tv 400 bytes stack frame, 464 bytes spill stores, 1116 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_LL128_Prod_uint64_tv 552 bytes stack frame, 408 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL_Prod_uint64_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_SIMPLE_Prod_uint64_tv 472 bytes stack frame, 988 bytes spill stores, 1424 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_LL128_Prod_uint64_tv 544 bytes stack frame, 380 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL_Prod_uint64_tv 368 bytes stack frame, 572 bytes spill stores, 788 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_SIMPLE_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_uint64_tv 240 bytes stack frame, 268 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_uint64_tv 664 bytes stack frame, 1476 bytes spill stores, 2196 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_SIMPLE_Prod_uint64_tv 304 bytes stack frame, 360 bytes spill stores, 668 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_LL128_Prod_uint64_tv 496 bytes stack frame, 352 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL_Prod_uint64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_SIMPLE_Prod_uint64_tv 416 bytes stack frame, 960 bytes spill stores, 1248 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_LL128_Prod_uint64_tv 504 bytes stack frame, 324 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL_Prod_uint64_tv 352 bytes stack frame, 528 bytes spill stores, 788 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int64_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int64_tv 560 bytes stack frame, 1184 bytes spill stores, 1664 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_int64_tv 256 bytes stack frame, 276 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_PreMulSum_int64_tv 504 bytes stack frame, 360 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_PreMulSum_int64_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_int64_tv 296 bytes stack frame, 396 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_PreMulSum_int64_tv 512 bytes stack frame, 352 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_PreMulSum_int64_tv 336 bytes stack frame, 568 bytes spill stores, 704 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int64_tv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int64_tv 560 bytes stack frame, 1184 bytes spill stores, 1664 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_int64_tv 256 bytes stack frame, 276 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_PreMulSum_int64_tv 504 bytes stack frame, 360 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_PreMulSum_int64_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_int64_tv 296 bytes stack frame, 396 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_PreMulSum_int64_tv 504 bytes stack frame, 348 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_PreMulSum_int64_tv 336 bytes stack frame, 568 bytes spill stores, 704 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int64_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int64_tv 560 bytes stack frame, 1184 bytes spill stores, 1664 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_int64_tv 256 bytes stack frame, 276 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_PreMulSum_int64_tv 504 bytes stack frame, 360 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_PreMulSum_int64_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_int64_tv 296 bytes stack frame, 396 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_PreMulSum_int64_tv 504 bytes stack frame, 348 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_PreMulSum_int64_tv 336 bytes stack frame, 568 bytes spill stores, 704 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int64_tv 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int64_tv 576 bytes stack frame, 1212 bytes spill stores, 1792 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_int64_tv 304 bytes stack frame, 328 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_PreMulSum_int64_tv 560 bytes stack frame, 408 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_PreMulSum_int64_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_int64_tv 304 bytes stack frame, 416 bytes spill stores, 580 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_PreMulSum_int64_tv 552 bytes stack frame, 384 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_PreMulSum_int64_tv 368 bytes stack frame, 564 bytes spill stores, 784 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int64_tv 304 bytes stack frame, 336 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int64_tv 704 bytes stack frame, 1568 bytes spill stores, 2452 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_int64_tv 424 bytes stack frame, 512 bytes spill stores, 1316 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_PreMulSum_int64_tv 560 bytes stack frame, 404 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_PreMulSum_int64_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_int64_tv 496 bytes stack frame, 1016 bytes spill stores, 1524 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_PreMulSum_int64_tv 560 bytes stack frame, 384 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_PreMulSum_int64_tv 376 bytes stack frame, 588 bytes spill stores, 808 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int64_tv 264 bytes stack frame, 300 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int64_tv 688 bytes stack frame, 1452 bytes spill stores, 2288 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_int64_tv 320 bytes stack frame, 380 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_PreMulSum_int64_tv 496 bytes stack frame, 340 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_PreMulSum_int64_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_int64_tv 440 bytes stack frame, 988 bytes spill stores, 1352 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_PreMulSum_int64_tv 496 bytes stack frame, 308 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_PreMulSum_int64_tv 360 bytes stack frame, 532 bytes spill stores, 808 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_TREE_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_NVLS_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_floatv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_floatv 496 bytes stack frame, 1004 bytes spill stores, 1320 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_SIMPLE_Min_floatv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL128_Min_floatv 496 bytes stack frame, 360 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_RING_LL_Min_floatv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_SIMPLE_Min_floatv 336 bytes stack frame, 488 bytes spill stores, 688 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL128_Min_floatv 504 bytes stack frame, 348 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_TREE_LL_Min_floatv 344 bytes stack frame, 592 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_TREE_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_NVLS_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_floatv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_floatv 496 bytes stack frame, 1004 bytes spill stores, 1320 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_SIMPLE_Min_floatv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL128_Min_floatv 496 bytes stack frame, 360 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_RING_LL_Min_floatv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_SIMPLE_Min_floatv 336 bytes stack frame, 488 bytes spill stores, 688 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL128_Min_floatv 504 bytes stack frame, 348 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_TREE_LL_Min_floatv 344 bytes stack frame, 592 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_TREE_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_NVLS_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_floatv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_floatv 496 bytes stack frame, 1004 bytes spill stores, 1320 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_SIMPLE_Min_floatv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL128_Min_floatv 496 bytes stack frame, 360 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_RING_LL_Min_floatv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_SIMPLE_Min_floatv 336 bytes stack frame, 488 bytes spill stores, 688 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL128_Min_floatv 504 bytes stack frame, 348 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_TREE_LL_Min_floatv 344 bytes stack frame, 592 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_TREE_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_NVLS_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_floatv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_floatv 528 bytes stack frame, 1048 bytes spill stores, 1412 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_SIMPLE_Min_floatv 288 bytes stack frame, 316 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL128_Min_floatv 552 bytes stack frame, 412 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_RING_LL_Min_floatv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_SIMPLE_Min_floatv 344 bytes stack frame, 496 bytes spill stores, 668 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL128_Min_floatv 552 bytes stack frame, 400 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_TREE_LL_Min_floatv 368 bytes stack frame, 592 bytes spill stores, 832 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_TREE_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_NVLS_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_floatv 288 bytes stack frame, 312 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_floatv 568 bytes stack frame, 1204 bytes spill stores, 1812 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_SIMPLE_Min_floatv 400 bytes stack frame, 468 bytes spill stores, 1164 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL128_Min_floatv 560 bytes stack frame, 416 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_RING_LL_Min_floatv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_SIMPLE_Min_floatv 480 bytes stack frame, 948 bytes spill stores, 1412 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL128_Min_floatv 560 bytes stack frame, 400 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_TREE_LL_Min_floatv 376 bytes stack frame, 604 bytes spill stores, 856 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_TREE_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_NVLS_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_floatv 240 bytes stack frame, 268 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_floatv 560 bytes stack frame, 1248 bytes spill stores, 1988 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_SIMPLE_Min_floatv 304 bytes stack frame, 352 bytes spill stores, 640 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL128_Min_floatv 504 bytes stack frame, 356 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_RING_LL_Min_floatv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_SIMPLE_Min_floatv 416 bytes stack frame, 828 bytes spill stores, 1196 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL128_Min_floatv 488 bytes stack frame, 308 bytes spill stores, 276 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_TREE_LL_Min_floatv 360 bytes stack frame, 572 bytes spill stores, 820 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_int32_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_int32_tv 480 bytes stack frame, 984 bytes spill stores, 1200 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Prod_int32_tv 240 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Prod_int32_tv 520 bytes stack frame, 384 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Prod_int32_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Prod_int32_tv 320 bytes stack frame, 464 bytes spill stores, 624 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Prod_int32_tv 488 bytes stack frame, 328 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Prod_int32_tv 344 bytes stack frame, 584 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_int32_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_int32_tv 480 bytes stack frame, 984 bytes spill stores, 1200 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Prod_int32_tv 240 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Prod_int32_tv 520 bytes stack frame, 384 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Prod_int32_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Prod_int32_tv 320 bytes stack frame, 464 bytes spill stores, 624 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Prod_int32_tv 488 bytes stack frame, 328 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Prod_int32_tv 344 bytes stack frame, 584 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_int32_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_int32_tv 480 bytes stack frame, 984 bytes spill stores, 1200 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Prod_int32_tv 240 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Prod_int32_tv 520 bytes stack frame, 384 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Prod_int32_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Prod_int32_tv 320 bytes stack frame, 464 bytes spill stores, 624 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Prod_int32_tv 488 bytes stack frame, 328 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Prod_int32_tv 344 bytes stack frame, 584 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_int32_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_int32_tv 536 bytes stack frame, 1148 bytes spill stores, 1496 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Prod_int32_tv 288 bytes stack frame, 316 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Prod_int32_tv 552 bytes stack frame, 412 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Prod_int32_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Prod_int32_tv 352 bytes stack frame, 532 bytes spill stores, 756 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Prod_int32_tv 552 bytes stack frame, 400 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Prod_int32_tv 368 bytes stack frame, 592 bytes spill stores, 840 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_int32_tv 288 bytes stack frame, 312 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_int32_tv 568 bytes stack frame, 1176 bytes spill stores, 1892 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Prod_int32_tv 400 bytes stack frame, 472 bytes spill stores, 1112 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Prod_int32_tv 560 bytes stack frame, 416 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Prod_int32_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Prod_int32_tv 496 bytes stack frame, 936 bytes spill stores, 1412 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Prod_int32_tv 560 bytes stack frame, 400 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Prod_int32_tv 376 bytes stack frame, 596 bytes spill stores, 864 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_int32_tv 240 bytes stack frame, 268 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_int32_tv 560 bytes stack frame, 1248 bytes spill stores, 2012 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Prod_int32_tv 304 bytes stack frame, 336 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Prod_int32_tv 504 bytes stack frame, 356 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Prod_int32_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Prod_int32_tv 416 bytes stack frame, 828 bytes spill stores, 1188 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Prod_int32_tv 480 bytes stack frame, 288 bytes spill stores, 272 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Prod_int32_tv 360 bytes stack frame, 580 bytes spill stores, 828 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_TREE_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_NVLS_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_floatv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_floatv 496 bytes stack frame, 1004 bytes spill stores, 1320 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_SIMPLE_Max_floatv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL128_Max_floatv 496 bytes stack frame, 360 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_RING_LL_Max_floatv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_SIMPLE_Max_floatv 336 bytes stack frame, 488 bytes spill stores, 688 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL128_Max_floatv 504 bytes stack frame, 348 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_TREE_LL_Max_floatv 344 bytes stack frame, 592 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_TREE_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_NVLS_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_floatv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_floatv 496 bytes stack frame, 1004 bytes spill stores, 1320 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_SIMPLE_Max_floatv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL128_Max_floatv 496 bytes stack frame, 360 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_RING_LL_Max_floatv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_SIMPLE_Max_floatv 336 bytes stack frame, 488 bytes spill stores, 688 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL128_Max_floatv 504 bytes stack frame, 348 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_TREE_LL_Max_floatv 344 bytes stack frame, 592 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_TREE_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_NVLS_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_floatv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_floatv 496 bytes stack frame, 1004 bytes spill stores, 1320 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_SIMPLE_Max_floatv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL128_Max_floatv 496 bytes stack frame, 360 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_RING_LL_Max_floatv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_SIMPLE_Max_floatv 336 bytes stack frame, 488 bytes spill stores, 688 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL128_Max_floatv 504 bytes stack frame, 348 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_TREE_LL_Max_floatv 344 bytes stack frame, 592 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_TREE_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_NVLS_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_floatv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_floatv 528 bytes stack frame, 1048 bytes spill stores, 1412 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_SIMPLE_Max_floatv 288 bytes stack frame, 316 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL128_Max_floatv 552 bytes stack frame, 412 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_RING_LL_Max_floatv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_SIMPLE_Max_floatv 344 bytes stack frame, 496 bytes spill stores, 668 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL128_Max_floatv 552 bytes stack frame, 400 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_TREE_LL_Max_floatv 368 bytes stack frame, 592 bytes spill stores, 832 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_TREE_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_NVLS_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_floatv 288 bytes stack frame, 312 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_floatv 568 bytes stack frame, 1204 bytes spill stores, 1812 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_SIMPLE_Max_floatv 400 bytes stack frame, 468 bytes spill stores, 1164 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL128_Max_floatv 560 bytes stack frame, 416 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_RING_LL_Max_floatv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_SIMPLE_Max_floatv 480 bytes stack frame, 948 bytes spill stores, 1412 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL128_Max_floatv 560 bytes stack frame, 400 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_TREE_LL_Max_floatv 376 bytes stack frame, 604 bytes spill stores, 856 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_TREE_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_NVLS_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_floatv 240 bytes stack frame, 268 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_floatv 560 bytes stack frame, 1248 bytes spill stores, 1988 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_SIMPLE_Max_floatv 304 bytes stack frame, 352 bytes spill stores, 640 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL128_Max_floatv 504 bytes stack frame, 356 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_RING_LL_Max_floatv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_SIMPLE_Max_floatv 416 bytes stack frame, 828 bytes spill stores, 1196 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL128_Max_floatv 488 bytes stack frame, 308 bytes spill stores, 276 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_TREE_LL_Max_floatv 360 bytes stack frame, 572 bytes spill stores, 820 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_uint32_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_uint32_tv 480 bytes stack frame, 984 bytes spill stores, 1200 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_SIMPLE_Prod_uint32_tv 240 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_LL128_Prod_uint32_tv 520 bytes stack frame, 384 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL_Prod_uint32_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_SIMPLE_Prod_uint32_tv 320 bytes stack frame, 464 bytes spill stores, 624 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_LL128_Prod_uint32_tv 488 bytes stack frame, 328 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL_Prod_uint32_tv 344 bytes stack frame, 584 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_uint32_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_uint32_tv 480 bytes stack frame, 984 bytes spill stores, 1200 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_SIMPLE_Prod_uint32_tv 240 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_LL128_Prod_uint32_tv 520 bytes stack frame, 384 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL_Prod_uint32_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_SIMPLE_Prod_uint32_tv 320 bytes stack frame, 464 bytes spill stores, 624 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_LL128_Prod_uint32_tv 488 bytes stack frame, 328 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL_Prod_uint32_tv 344 bytes stack frame, 584 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_uint32_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_uint32_tv 480 bytes stack frame, 984 bytes spill stores, 1200 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_SIMPLE_Prod_uint32_tv 240 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_LL128_Prod_uint32_tv 520 bytes stack frame, 384 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL_Prod_uint32_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_SIMPLE_Prod_uint32_tv 320 bytes stack frame, 464 bytes spill stores, 624 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_LL128_Prod_uint32_tv 488 bytes stack frame, 328 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL_Prod_uint32_tv 344 bytes stack frame, 584 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_uint32_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_uint32_tv 536 bytes stack frame, 1148 bytes spill stores, 1496 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_SIMPLE_Prod_uint32_tv 288 bytes stack frame, 316 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_LL128_Prod_uint32_tv 552 bytes stack frame, 412 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL_Prod_uint32_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_SIMPLE_Prod_uint32_tv 352 bytes stack frame, 532 bytes spill stores, 756 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_LL128_Prod_uint32_tv 552 bytes stack frame, 400 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL_Prod_uint32_tv 368 bytes stack frame, 592 bytes spill stores, 840 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_uint32_tv 288 bytes stack frame, 312 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_uint32_tv 568 bytes stack frame, 1176 bytes spill stores, 1892 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_SIMPLE_Prod_uint32_tv 400 bytes stack frame, 472 bytes spill stores, 1112 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_LL128_Prod_uint32_tv 560 bytes stack frame, 416 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL_Prod_uint32_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_SIMPLE_Prod_uint32_tv 496 bytes stack frame, 936 bytes spill stores, 1412 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_LL128_Prod_uint32_tv 560 bytes stack frame, 400 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL_Prod_uint32_tv 376 bytes stack frame, 596 bytes spill stores, 864 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_SIMPLE_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_uint32_tv 240 bytes stack frame, 268 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_uint32_tv 560 bytes stack frame, 1248 bytes spill stores, 2012 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_SIMPLE_Prod_uint32_tv 304 bytes stack frame, 336 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_LL128_Prod_uint32_tv 504 bytes stack frame, 356 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL_Prod_uint32_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_SIMPLE_Prod_uint32_tv 416 bytes stack frame, 828 bytes spill stores, 1188 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_LL128_Prod_uint32_tv 480 bytes stack frame, 288 bytes spill stores, 272 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL_Prod_uint32_tv 360 bytes stack frame, 580 bytes spill stores, 828 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_int64_tv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_int64_tv 496 bytes stack frame, 984 bytes spill stores, 1328 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Max_int64_tv 248 bytes stack frame, 272 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Max_int64_tv 520 bytes stack frame, 384 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Max_int64_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Max_int64_tv 288 bytes stack frame, 360 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Max_int64_tv 504 bytes stack frame, 348 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Max_int64_tv 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_int64_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_int64_tv 496 bytes stack frame, 984 bytes spill stores, 1328 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Max_int64_tv 248 bytes stack frame, 272 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Max_int64_tv 520 bytes stack frame, 384 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Max_int64_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Max_int64_tv 288 bytes stack frame, 360 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Max_int64_tv 504 bytes stack frame, 348 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Max_int64_tv 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_int64_tv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_int64_tv 496 bytes stack frame, 984 bytes spill stores, 1328 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Max_int64_tv 248 bytes stack frame, 272 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Max_int64_tv 520 bytes stack frame, 384 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Max_int64_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Max_int64_tv 288 bytes stack frame, 360 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Max_int64_tv 504 bytes stack frame, 348 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Max_int64_tv 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_int64_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_int64_tv 544 bytes stack frame, 1072 bytes spill stores, 1500 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Max_int64_tv 296 bytes stack frame, 320 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Max_int64_tv 560 bytes stack frame, 436 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Max_int64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Max_int64_tv 304 bytes stack frame, 416 bytes spill stores, 580 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Max_int64_tv 544 bytes stack frame, 380 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Max_int64_tv 360 bytes stack frame, 568 bytes spill stores, 764 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_int64_tv 296 bytes stack frame, 324 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_int64_tv 640 bytes stack frame, 1408 bytes spill stores, 2132 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Max_int64_tv 400 bytes stack frame, 484 bytes spill stores, 1072 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Max_int64_tv 568 bytes stack frame, 448 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Max_int64_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Max_int64_tv 416 bytes stack frame, 820 bytes spill stores, 1256 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Max_int64_tv 544 bytes stack frame, 380 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Max_int64_tv 368 bytes stack frame, 580 bytes spill stores, 788 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_int64_tv 440 bytes stack frame, 740 bytes spill stores, 1076 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Max_int64_tv 400 bytes stack frame, 680 bytes spill stores, 936 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_int64_tv 240 bytes stack frame, 264 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_int64_tv 648 bytes stack frame, 1436 bytes spill stores, 2104 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Max_int64_tv 304 bytes stack frame, 336 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Max_int64_tv 512 bytes stack frame, 368 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Max_int64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Max_int64_tv 352 bytes stack frame, 760 bytes spill stores, 996 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Max_int64_tv 488 bytes stack frame, 304 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Max_int64_tv 352 bytes stack frame, 528 bytes spill stores, 788 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_int32_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Min_int32_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Min_int32_tv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Min_int32_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_int32_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Min_int32_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Min_int32_tv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Min_int32_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_int32_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Min_int32_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Min_int32_tv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Min_int32_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_int32_tv 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Min_int32_tv 304 bytes stack frame, 332 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Min_int32_tv 552 bytes stack frame, 420 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Min_int32_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_int32_tv 336 bytes stack frame, 348 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Min_int32_tv 408 bytes stack frame, 444 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Min_int32_tv 560 bytes stack frame, 432 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Min_int32_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_int32_tv 288 bytes stack frame, 300 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Min_int32_tv 312 bytes stack frame, 340 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Min_int32_tv 528 bytes stack frame, 400 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Min_int32_tv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_int8_tv 272 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Min_int8_tv 272 bytes stack frame, 300 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Min_int8_tv 504 bytes stack frame, 348 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Min_int8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_int8_tv 272 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Min_int8_tv 272 bytes stack frame, 300 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Min_int8_tv 504 bytes stack frame, 348 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Min_int8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_int8_tv 272 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Min_int8_tv 272 bytes stack frame, 300 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Min_int8_tv 504 bytes stack frame, 348 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Min_int8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_int8_tv 288 bytes stack frame, 308 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Min_int8_tv 392 bytes stack frame, 436 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Min_int8_tv 568 bytes stack frame, 460 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Min_int8_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_int8_tv 424 bytes stack frame, 492 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Min_int8_tv 488 bytes stack frame, 676 bytes spill stores, 1256 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Min_int8_tv 568 bytes stack frame, 440 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Min_int8_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_int8_tv 416 bytes stack frame, 556 bytes spill stores, 956 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Min_int8_tv 408 bytes stack frame, 532 bytes spill stores, 828 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Min_int8_tv 536 bytes stack frame, 392 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Min_int8_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_uint64_tv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_uint64_tv 496 bytes stack frame, 984 bytes spill stores, 1328 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Min_uint64_tv 248 bytes stack frame, 272 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Min_uint64_tv 520 bytes stack frame, 384 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Min_uint64_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Min_uint64_tv 288 bytes stack frame, 360 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Min_uint64_tv 504 bytes stack frame, 348 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Min_uint64_tv 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_uint64_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_uint64_tv 496 bytes stack frame, 984 bytes spill stores, 1328 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Min_uint64_tv 248 bytes stack frame, 272 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Min_uint64_tv 520 bytes stack frame, 384 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Min_uint64_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Min_uint64_tv 288 bytes stack frame, 360 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Min_uint64_tv 504 bytes stack frame, 348 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Min_uint64_tv 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_uint64_tv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_uint64_tv 496 bytes stack frame, 984 bytes spill stores, 1328 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Min_uint64_tv 248 bytes stack frame, 272 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Min_uint64_tv 520 bytes stack frame, 384 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Min_uint64_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Min_uint64_tv 288 bytes stack frame, 360 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Min_uint64_tv 504 bytes stack frame, 348 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Min_uint64_tv 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_uint64_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_uint64_tv 544 bytes stack frame, 1072 bytes spill stores, 1500 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Min_uint64_tv 296 bytes stack frame, 320 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Min_uint64_tv 560 bytes stack frame, 436 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Min_uint64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Min_uint64_tv 304 bytes stack frame, 416 bytes spill stores, 580 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Min_uint64_tv 544 bytes stack frame, 380 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Min_uint64_tv 360 bytes stack frame, 568 bytes spill stores, 764 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_uint64_tv 296 bytes stack frame, 324 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_uint64_tv 640 bytes stack frame, 1408 bytes spill stores, 2132 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Min_uint64_tv 400 bytes stack frame, 484 bytes spill stores, 1072 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Min_uint64_tv 568 bytes stack frame, 448 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Min_uint64_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Min_uint64_tv 416 bytes stack frame, 820 bytes spill stores, 1256 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Min_uint64_tv 544 bytes stack frame, 380 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Min_uint64_tv 368 bytes stack frame, 580 bytes spill stores, 788 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_uint64_tv 440 bytes stack frame, 740 bytes spill stores, 1076 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Min_uint64_tv 400 bytes stack frame, 680 bytes spill stores, 936 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_uint64_tv 240 bytes stack frame, 264 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_uint64_tv 648 bytes stack frame, 1436 bytes spill stores, 2104 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Min_uint64_tv 304 bytes stack frame, 336 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Min_uint64_tv 512 bytes stack frame, 368 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Min_uint64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Min_uint64_tv 352 bytes stack frame, 760 bytes spill stores, 996 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Min_uint64_tv 488 bytes stack frame, 304 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Min_uint64_tv 352 bytes stack frame, 528 bytes spill stores, 788 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint64_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint64_tv 560 bytes stack frame, 1184 bytes spill stores, 1664 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_uint64_tv 256 bytes stack frame, 276 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_PreMulSum_uint64_tv 504 bytes stack frame, 360 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_PreMulSum_uint64_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_uint64_tv 296 bytes stack frame, 396 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_PreMulSum_uint64_tv 512 bytes stack frame, 352 bytes spill stores, 344 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_PreMulSum_uint64_tv 336 bytes stack frame, 568 bytes spill stores, 704 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint64_tv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint64_tv 560 bytes stack frame, 1184 bytes spill stores, 1664 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_uint64_tv 256 bytes stack frame, 276 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_PreMulSum_uint64_tv 504 bytes stack frame, 360 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_PreMulSum_uint64_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_uint64_tv 296 bytes stack frame, 396 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_PreMulSum_uint64_tv 504 bytes stack frame, 348 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_PreMulSum_uint64_tv 336 bytes stack frame, 568 bytes spill stores, 704 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint64_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint64_tv 560 bytes stack frame, 1184 bytes spill stores, 1664 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_uint64_tv 256 bytes stack frame, 276 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_PreMulSum_uint64_tv 504 bytes stack frame, 360 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_PreMulSum_uint64_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_uint64_tv 296 bytes stack frame, 396 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_PreMulSum_uint64_tv 504 bytes stack frame, 348 bytes spill stores, 364 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_PreMulSum_uint64_tv 336 bytes stack frame, 568 bytes spill stores, 704 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint64_tv 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint64_tv 576 bytes stack frame, 1212 bytes spill stores, 1792 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_uint64_tv 304 bytes stack frame, 328 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_PreMulSum_uint64_tv 560 bytes stack frame, 408 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_PreMulSum_uint64_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_uint64_tv 304 bytes stack frame, 416 bytes spill stores, 580 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_PreMulSum_uint64_tv 552 bytes stack frame, 384 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_PreMulSum_uint64_tv 368 bytes stack frame, 564 bytes spill stores, 784 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint64_tv 304 bytes stack frame, 336 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint64_tv 704 bytes stack frame, 1568 bytes spill stores, 2452 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_uint64_tv 424 bytes stack frame, 512 bytes spill stores, 1316 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_PreMulSum_uint64_tv 560 bytes stack frame, 404 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_PreMulSum_uint64_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_uint64_tv 496 bytes stack frame, 1016 bytes spill stores, 1524 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_PreMulSum_uint64_tv 560 bytes stack frame, 384 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_PreMulSum_uint64_tv 376 bytes stack frame, 588 bytes spill stores, 808 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint64_tv 264 bytes stack frame, 300 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint64_tv 688 bytes stack frame, 1452 bytes spill stores, 2288 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_uint64_tv 320 bytes stack frame, 380 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_PreMulSum_uint64_tv 496 bytes stack frame, 340 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_PreMulSum_uint64_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_uint64_tv 440 bytes stack frame, 988 bytes spill stores, 1352 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_PreMulSum_uint64_tv 496 bytes stack frame, 308 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_PreMulSum_uint64_tv 360 bytes stack frame, 532 bytes spill stores, 808 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_int64_tv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_int64_tv 496 bytes stack frame, 984 bytes spill stores, 1328 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Min_int64_tv 248 bytes stack frame, 272 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Min_int64_tv 520 bytes stack frame, 384 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Min_int64_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Min_int64_tv 288 bytes stack frame, 360 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Min_int64_tv 504 bytes stack frame, 348 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Min_int64_tv 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_int64_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_int64_tv 496 bytes stack frame, 984 bytes spill stores, 1328 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Min_int64_tv 248 bytes stack frame, 272 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Min_int64_tv 520 bytes stack frame, 384 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Min_int64_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Min_int64_tv 288 bytes stack frame, 360 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Min_int64_tv 504 bytes stack frame, 348 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Min_int64_tv 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_int64_tv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_int64_tv 496 bytes stack frame, 984 bytes spill stores, 1328 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Min_int64_tv 248 bytes stack frame, 272 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Min_int64_tv 520 bytes stack frame, 384 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Min_int64_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Min_int64_tv 288 bytes stack frame, 360 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Min_int64_tv 504 bytes stack frame, 348 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Min_int64_tv 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_int64_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_int64_tv 544 bytes stack frame, 1072 bytes spill stores, 1500 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Min_int64_tv 296 bytes stack frame, 320 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Min_int64_tv 560 bytes stack frame, 436 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Min_int64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Min_int64_tv 304 bytes stack frame, 416 bytes spill stores, 580 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Min_int64_tv 544 bytes stack frame, 380 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Min_int64_tv 360 bytes stack frame, 568 bytes spill stores, 764 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_int64_tv 296 bytes stack frame, 324 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_int64_tv 640 bytes stack frame, 1408 bytes spill stores, 2132 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Min_int64_tv 400 bytes stack frame, 484 bytes spill stores, 1072 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Min_int64_tv 568 bytes stack frame, 448 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Min_int64_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Min_int64_tv 416 bytes stack frame, 820 bytes spill stores, 1256 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Min_int64_tv 544 bytes stack frame, 380 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Min_int64_tv 368 bytes stack frame, 580 bytes spill stores, 788 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_int64_tv 440 bytes stack frame, 740 bytes spill stores, 1076 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Min_int64_tv 400 bytes stack frame, 680 bytes spill stores, 936 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_int64_tv 240 bytes stack frame, 264 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_int64_tv 648 bytes stack frame, 1436 bytes spill stores, 2104 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Min_int64_tv 304 bytes stack frame, 336 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Min_int64_tv 512 bytes stack frame, 368 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Min_int64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Min_int64_tv 352 bytes stack frame, 760 bytes spill stores, 996 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Min_int64_tv 488 bytes stack frame, 304 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Min_int64_tv 352 bytes stack frame, 528 bytes spill stores, 788 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod___nv_bfloat16v 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_Prod___nv_bfloat16v 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_Prod___nv_bfloat16v 520 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_Prod___nv_bfloat16v 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod___nv_bfloat16v 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_Prod___nv_bfloat16v 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_Prod___nv_bfloat16v 520 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_Prod___nv_bfloat16v 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod___nv_bfloat16v 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_Prod___nv_bfloat16v 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_Prod___nv_bfloat16v 520 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_Prod___nv_bfloat16v 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod___nv_bfloat16v 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_Prod___nv_bfloat16v 264 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_Prod___nv_bfloat16v 560 bytes stack frame, 428 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_Prod___nv_bfloat16v 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod___nv_bfloat16v 336 bytes stack frame, 348 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_Prod___nv_bfloat16v 384 bytes stack frame, 440 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_Prod___nv_bfloat16v 568 bytes stack frame, 432 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_Prod___nv_bfloat16v 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_Prod___nv_bfloat16v 272 bytes stack frame, 288 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_Prod___nv_bfloat16v 280 bytes stack frame, 304 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_Prod___nv_bfloat16v 536 bytes stack frame, 400 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_Prod___nv_bfloat16v 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_uint64_tv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_uint64_tv 496 bytes stack frame, 984 bytes spill stores, 1328 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Max_uint64_tv 248 bytes stack frame, 272 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Max_uint64_tv 520 bytes stack frame, 384 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Max_uint64_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Max_uint64_tv 288 bytes stack frame, 360 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Max_uint64_tv 504 bytes stack frame, 348 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Max_uint64_tv 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_uint64_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_uint64_tv 496 bytes stack frame, 984 bytes spill stores, 1328 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Max_uint64_tv 248 bytes stack frame, 272 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Max_uint64_tv 520 bytes stack frame, 384 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Max_uint64_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Max_uint64_tv 288 bytes stack frame, 360 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Max_uint64_tv 504 bytes stack frame, 348 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Max_uint64_tv 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_uint64_tv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_uint64_tv 496 bytes stack frame, 984 bytes spill stores, 1328 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Max_uint64_tv 248 bytes stack frame, 272 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Max_uint64_tv 520 bytes stack frame, 384 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Max_uint64_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Max_uint64_tv 288 bytes stack frame, 360 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Max_uint64_tv 504 bytes stack frame, 348 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Max_uint64_tv 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_uint64_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_uint64_tv 544 bytes stack frame, 1072 bytes spill stores, 1500 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Max_uint64_tv 296 bytes stack frame, 320 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Max_uint64_tv 560 bytes stack frame, 436 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Max_uint64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Max_uint64_tv 304 bytes stack frame, 416 bytes spill stores, 580 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Max_uint64_tv 544 bytes stack frame, 380 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Max_uint64_tv 360 bytes stack frame, 568 bytes spill stores, 764 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_uint64_tv 296 bytes stack frame, 324 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_uint64_tv 640 bytes stack frame, 1408 bytes spill stores, 2132 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Max_uint64_tv 400 bytes stack frame, 484 bytes spill stores, 1072 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Max_uint64_tv 568 bytes stack frame, 448 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Max_uint64_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Max_uint64_tv 416 bytes stack frame, 820 bytes spill stores, 1256 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Max_uint64_tv 544 bytes stack frame, 380 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Max_uint64_tv 368 bytes stack frame, 580 bytes spill stores, 788 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_uint64_tv 440 bytes stack frame, 740 bytes spill stores, 1076 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Max_uint64_tv 400 bytes stack frame, 680 bytes spill stores, 936 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_uint64_tv 240 bytes stack frame, 264 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_uint64_tv 648 bytes stack frame, 1436 bytes spill stores, 2104 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Max_uint64_tv 304 bytes stack frame, 336 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Max_uint64_tv 512 bytes stack frame, 368 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Max_uint64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Max_uint64_tv 352 bytes stack frame, 760 bytes spill stores, 996 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Max_uint64_tv 488 bytes stack frame, 304 bytes spill stores, 300 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Max_uint64_tv 352 bytes stack frame, 528 bytes spill stores, 788 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int32_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int32_tv 488 bytes stack frame, 984 bytes spill stores, 1280 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_int32_tv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_PreMulSum_int32_tv 520 bytes stack frame, 384 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_PreMulSum_int32_tv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_int32_tv 320 bytes stack frame, 464 bytes spill stores, 624 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_PreMulSum_int32_tv 496 bytes stack frame, 344 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_PreMulSum_int32_tv 344 bytes stack frame, 592 bytes spill stores, 752 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int32_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int32_tv 488 bytes stack frame, 984 bytes spill stores, 1280 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_int32_tv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_PreMulSum_int32_tv 520 bytes stack frame, 384 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_PreMulSum_int32_tv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_int32_tv 320 bytes stack frame, 464 bytes spill stores, 624 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_PreMulSum_int32_tv 496 bytes stack frame, 344 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_PreMulSum_int32_tv 344 bytes stack frame, 592 bytes spill stores, 752 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int32_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int32_tv 488 bytes stack frame, 984 bytes spill stores, 1280 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_int32_tv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_PreMulSum_int32_tv 520 bytes stack frame, 384 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_PreMulSum_int32_tv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_int32_tv 320 bytes stack frame, 464 bytes spill stores, 624 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_PreMulSum_int32_tv 496 bytes stack frame, 344 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_PreMulSum_int32_tv 344 bytes stack frame, 592 bytes spill stores, 752 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int32_tv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int32_tv 544 bytes stack frame, 1148 bytes spill stores, 1496 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_int32_tv 296 bytes stack frame, 320 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_PreMulSum_int32_tv 560 bytes stack frame, 412 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_PreMulSum_int32_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_int32_tv 352 bytes stack frame, 572 bytes spill stores, 808 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_PreMulSum_int32_tv 560 bytes stack frame, 404 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_PreMulSum_int32_tv 368 bytes stack frame, 604 bytes spill stores, 844 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int32_tv 288 bytes stack frame, 316 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int32_tv 600 bytes stack frame, 1304 bytes spill stores, 2044 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_int32_tv 400 bytes stack frame, 468 bytes spill stores, 1164 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_PreMulSum_int32_tv 560 bytes stack frame, 416 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_PreMulSum_int32_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_int32_tv 528 bytes stack frame, 1072 bytes spill stores, 1604 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_PreMulSum_int32_tv 560 bytes stack frame, 408 bytes spill stores, 584 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_PreMulSum_int32_tv 384 bytes stack frame, 612 bytes spill stores, 876 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int32_tv 248 bytes stack frame, 272 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int32_tv 568 bytes stack frame, 1236 bytes spill stores, 1960 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_int32_tv 304 bytes stack frame, 348 bytes spill stores, 632 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_PreMulSum_int32_tv 496 bytes stack frame, 356 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_PreMulSum_int32_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_int32_tv 416 bytes stack frame, 892 bytes spill stores, 1244 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_PreMulSum_int32_tv 480 bytes stack frame, 308 bytes spill stores, 284 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_PreMulSum_int32_tv 368 bytes stack frame, 584 bytes spill stores, 828 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z55ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_floatv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_floatv 488 bytes stack frame, 996 bytes spill stores, 1312 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_floatv 248 bytes stack frame, 268 bytes spill stores, 284 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL128_PreMulSum_floatv 520 bytes stack frame, 388 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL_PreMulSum_floatv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_floatv 320 bytes stack frame, 460 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL128_PreMulSum_floatv 512 bytes stack frame, 356 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL_PreMulSum_floatv 344 bytes stack frame, 584 bytes spill stores, 764 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z55ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_floatv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_floatv 488 bytes stack frame, 996 bytes spill stores, 1312 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_floatv 248 bytes stack frame, 268 bytes spill stores, 284 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL128_PreMulSum_floatv 520 bytes stack frame, 388 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL_PreMulSum_floatv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_floatv 320 bytes stack frame, 460 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL128_PreMulSum_floatv 512 bytes stack frame, 356 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL_PreMulSum_floatv 344 bytes stack frame, 584 bytes spill stores, 764 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z55ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_floatv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_floatv 488 bytes stack frame, 996 bytes spill stores, 1312 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_floatv 248 bytes stack frame, 268 bytes spill stores, 284 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL128_PreMulSum_floatv 520 bytes stack frame, 388 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL_PreMulSum_floatv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_floatv 320 bytes stack frame, 460 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL128_PreMulSum_floatv 512 bytes stack frame, 356 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL_PreMulSum_floatv 344 bytes stack frame, 584 bytes spill stores, 764 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z55ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_floatv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_floatv 536 bytes stack frame, 1060 bytes spill stores, 1424 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_floatv 296 bytes stack frame, 320 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL128_PreMulSum_floatv 560 bytes stack frame, 412 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL_PreMulSum_floatv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_floatv 336 bytes stack frame, 484 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL128_PreMulSum_floatv 560 bytes stack frame, 400 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL_PreMulSum_floatv 368 bytes stack frame, 604 bytes spill stores, 844 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z55ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_floatv 288 bytes stack frame, 316 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_floatv 568 bytes stack frame, 1212 bytes spill stores, 1940 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_floatv 408 bytes stack frame, 464 bytes spill stores, 1232 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL128_PreMulSum_floatv 560 bytes stack frame, 416 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL_PreMulSum_floatv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_floatv 536 bytes stack frame, 1084 bytes spill stores, 1620 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL128_PreMulSum_floatv 560 bytes stack frame, 408 bytes spill stores, 584 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL_PreMulSum_floatv 384 bytes stack frame, 612 bytes spill stores, 876 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z55ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_floatv 248 bytes stack frame, 272 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_floatv 560 bytes stack frame, 1276 bytes spill stores, 2024 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_floatv 304 bytes stack frame, 344 bytes spill stores, 628 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL128_PreMulSum_floatv 496 bytes stack frame, 356 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL_PreMulSum_floatv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_floatv 408 bytes stack frame, 900 bytes spill stores, 1236 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL128_PreMulSum_floatv 488 bytes stack frame, 324 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL_PreMulSum_floatv 368 bytes stack frame, 576 bytes spill stores, 820 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint32_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint32_tv 488 bytes stack frame, 984 bytes spill stores, 1280 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_uint32_tv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_PreMulSum_uint32_tv 520 bytes stack frame, 384 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_PreMulSum_uint32_tv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_uint32_tv 320 bytes stack frame, 464 bytes spill stores, 624 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_PreMulSum_uint32_tv 496 bytes stack frame, 344 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_PreMulSum_uint32_tv 344 bytes stack frame, 592 bytes spill stores, 752 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint32_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint32_tv 488 bytes stack frame, 984 bytes spill stores, 1280 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_uint32_tv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_PreMulSum_uint32_tv 520 bytes stack frame, 384 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_PreMulSum_uint32_tv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_uint32_tv 320 bytes stack frame, 464 bytes spill stores, 624 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_PreMulSum_uint32_tv 496 bytes stack frame, 344 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_PreMulSum_uint32_tv 344 bytes stack frame, 592 bytes spill stores, 752 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint32_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint32_tv 488 bytes stack frame, 984 bytes spill stores, 1280 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_uint32_tv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_PreMulSum_uint32_tv 520 bytes stack frame, 384 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_PreMulSum_uint32_tv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_uint32_tv 320 bytes stack frame, 464 bytes spill stores, 624 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_PreMulSum_uint32_tv 496 bytes stack frame, 344 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_PreMulSum_uint32_tv 344 bytes stack frame, 592 bytes spill stores, 752 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint32_tv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint32_tv 544 bytes stack frame, 1148 bytes spill stores, 1496 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_uint32_tv 296 bytes stack frame, 320 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_PreMulSum_uint32_tv 560 bytes stack frame, 412 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_PreMulSum_uint32_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_uint32_tv 352 bytes stack frame, 572 bytes spill stores, 808 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_PreMulSum_uint32_tv 560 bytes stack frame, 404 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_PreMulSum_uint32_tv 368 bytes stack frame, 604 bytes spill stores, 844 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint32_tv 288 bytes stack frame, 316 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint32_tv 600 bytes stack frame, 1304 bytes spill stores, 2044 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_uint32_tv 400 bytes stack frame, 468 bytes spill stores, 1164 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_PreMulSum_uint32_tv 560 bytes stack frame, 416 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_PreMulSum_uint32_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_uint32_tv 528 bytes stack frame, 1072 bytes spill stores, 1604 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_PreMulSum_uint32_tv 560 bytes stack frame, 408 bytes spill stores, 584 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_PreMulSum_uint32_tv 384 bytes stack frame, 612 bytes spill stores, 876 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint32_tv 248 bytes stack frame, 272 bytes spill stores, 312 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint32_tv 568 bytes stack frame, 1236 bytes spill stores, 1960 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_uint32_tv 304 bytes stack frame, 348 bytes spill stores, 632 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_PreMulSum_uint32_tv 496 bytes stack frame, 356 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_PreMulSum_uint32_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_uint32_tv 416 bytes stack frame, 892 bytes spill stores, 1244 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_PreMulSum_uint32_tv 480 bytes stack frame, 308 bytes spill stores, 284 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_PreMulSum_uint32_tv 368 bytes stack frame, 584 bytes spill stores, 828 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_uint8_tv 272 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Min_uint8_tv 272 bytes stack frame, 300 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Min_uint8_tv 504 bytes stack frame, 348 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Min_uint8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_uint8_tv 272 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Min_uint8_tv 272 bytes stack frame, 300 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Min_uint8_tv 504 bytes stack frame, 348 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Min_uint8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_uint8_tv 272 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Min_uint8_tv 272 bytes stack frame, 300 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Min_uint8_tv 504 bytes stack frame, 348 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Min_uint8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_uint8_tv 288 bytes stack frame, 308 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Min_uint8_tv 392 bytes stack frame, 436 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Min_uint8_tv 568 bytes stack frame, 460 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Min_uint8_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_uint8_tv 424 bytes stack frame, 492 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Min_uint8_tv 480 bytes stack frame, 668 bytes spill stores, 1248 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Min_uint8_tv 568 bytes stack frame, 440 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Min_uint8_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_uint8_tv 416 bytes stack frame, 556 bytes spill stores, 956 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Min_uint8_tv 416 bytes stack frame, 528 bytes spill stores, 832 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Min_uint8_tv 544 bytes stack frame, 400 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Min_uint8_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_int32_tv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_int32_tv 488 bytes stack frame, 992 bytes spill stores, 1208 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Max_int32_tv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Max_int32_tv 472 bytes stack frame, 332 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Max_int32_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Max_int32_tv 320 bytes stack frame, 460 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Max_int32_tv 488 bytes stack frame, 328 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Max_int32_tv 344 bytes stack frame, 592 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_int32_tv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_int32_tv 488 bytes stack frame, 992 bytes spill stores, 1208 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Max_int32_tv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Max_int32_tv 472 bytes stack frame, 332 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Max_int32_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Max_int32_tv 320 bytes stack frame, 460 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Max_int32_tv 488 bytes stack frame, 328 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Max_int32_tv 344 bytes stack frame, 592 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_int32_tv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_int32_tv 488 bytes stack frame, 992 bytes spill stores, 1208 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Max_int32_tv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Max_int32_tv 472 bytes stack frame, 332 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Max_int32_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Max_int32_tv 320 bytes stack frame, 460 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Max_int32_tv 488 bytes stack frame, 328 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Max_int32_tv 344 bytes stack frame, 592 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_int32_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_int32_tv 536 bytes stack frame, 1148 bytes spill stores, 1496 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Max_int32_tv 288 bytes stack frame, 316 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Max_int32_tv 528 bytes stack frame, 384 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Max_int32_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Max_int32_tv 352 bytes stack frame, 532 bytes spill stores, 756 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Max_int32_tv 560 bytes stack frame, 400 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Max_int32_tv 368 bytes stack frame, 584 bytes spill stores, 832 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_int32_tv 288 bytes stack frame, 312 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_int32_tv 568 bytes stack frame, 1176 bytes spill stores, 1892 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Max_int32_tv 400 bytes stack frame, 472 bytes spill stores, 1112 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Max_int32_tv 528 bytes stack frame, 388 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Max_int32_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Max_int32_tv 496 bytes stack frame, 936 bytes spill stores, 1412 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Max_int32_tv 560 bytes stack frame, 400 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Max_int32_tv 376 bytes stack frame, 596 bytes spill stores, 864 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_int32_tv 440 bytes stack frame, 752 bytes spill stores, 1140 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Max_int32_tv 400 bytes stack frame, 680 bytes spill stores, 936 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_int32_tv 240 bytes stack frame, 268 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_int32_tv 560 bytes stack frame, 1248 bytes spill stores, 2012 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Max_int32_tv 304 bytes stack frame, 336 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Max_int32_tv 488 bytes stack frame, 348 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Max_int32_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Max_int32_tv 416 bytes stack frame, 828 bytes spill stores, 1188 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Max_int32_tv 480 bytes stack frame, 288 bytes spill stores, 272 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Max_int32_tv 360 bytes stack frame, 580 bytes spill stores, 828 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_int32_tv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_int32_tv 488 bytes stack frame, 992 bytes spill stores, 1208 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Min_int32_tv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Min_int32_tv 472 bytes stack frame, 332 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Min_int32_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Min_int32_tv 320 bytes stack frame, 460 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Min_int32_tv 488 bytes stack frame, 328 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Min_int32_tv 344 bytes stack frame, 592 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_int32_tv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_int32_tv 488 bytes stack frame, 992 bytes spill stores, 1208 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Min_int32_tv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Min_int32_tv 472 bytes stack frame, 332 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Min_int32_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Min_int32_tv 320 bytes stack frame, 460 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Min_int32_tv 488 bytes stack frame, 328 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Min_int32_tv 344 bytes stack frame, 592 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_int32_tv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_int32_tv 488 bytes stack frame, 992 bytes spill stores, 1208 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Min_int32_tv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Min_int32_tv 472 bytes stack frame, 332 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Min_int32_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Min_int32_tv 320 bytes stack frame, 460 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Min_int32_tv 488 bytes stack frame, 328 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Min_int32_tv 344 bytes stack frame, 592 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_int32_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_int32_tv 536 bytes stack frame, 1148 bytes spill stores, 1496 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Min_int32_tv 288 bytes stack frame, 316 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Min_int32_tv 528 bytes stack frame, 384 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Min_int32_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Min_int32_tv 352 bytes stack frame, 532 bytes spill stores, 756 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Min_int32_tv 560 bytes stack frame, 400 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Min_int32_tv 368 bytes stack frame, 584 bytes spill stores, 832 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_int32_tv 288 bytes stack frame, 312 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_int32_tv 568 bytes stack frame, 1176 bytes spill stores, 1892 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Min_int32_tv 400 bytes stack frame, 472 bytes spill stores, 1112 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Min_int32_tv 528 bytes stack frame, 388 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Min_int32_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Min_int32_tv 496 bytes stack frame, 936 bytes spill stores, 1412 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Min_int32_tv 560 bytes stack frame, 400 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Min_int32_tv 376 bytes stack frame, 596 bytes spill stores, 864 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_int32_tv 440 bytes stack frame, 752 bytes spill stores, 1140 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Min_int32_tv 400 bytes stack frame, 680 bytes spill stores, 936 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_int32_tv 240 bytes stack frame, 268 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_int32_tv 560 bytes stack frame, 1248 bytes spill stores, 2012 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Min_int32_tv 304 bytes stack frame, 336 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Min_int32_tv 488 bytes stack frame, 348 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Min_int32_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Min_int32_tv 416 bytes stack frame, 828 bytes spill stores, 1188 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Min_int32_tv 480 bytes stack frame, 288 bytes spill stores, 272 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Min_int32_tv 360 bytes stack frame, 580 bytes spill stores, 828 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_uint32_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Min_uint32_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Min_uint32_tv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Min_uint32_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_uint32_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Min_uint32_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Min_uint32_tv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Min_uint32_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_uint32_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Min_uint32_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Min_uint32_tv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Min_uint32_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_uint32_tv 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Min_uint32_tv 304 bytes stack frame, 332 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Min_uint32_tv 552 bytes stack frame, 420 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Min_uint32_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_uint32_tv 336 bytes stack frame, 348 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Min_uint32_tv 408 bytes stack frame, 444 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Min_uint32_tv 560 bytes stack frame, 432 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Min_uint32_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_uint32_tv 288 bytes stack frame, 300 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Min_uint32_tv 312 bytes stack frame, 340 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Min_uint32_tv 528 bytes stack frame, 400 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Min_uint32_tv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z59ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint32_tv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint32_tv 504 bytes stack frame, 1056 bytes spill stores, 1352 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_uint32_tv 248 bytes stack frame, 268 bytes spill stores, 284 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_LL128_SumPostDiv_uint32_tv 496 bytes stack frame, 360 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_RING_LL_SumPostDiv_uint32_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_uint32_tv 304 bytes stack frame, 392 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_LL128_SumPostDiv_uint32_tv 488 bytes stack frame, 336 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_TREE_LL_SumPostDiv_uint32_tv 344 bytes stack frame, 592 bytes spill stores, 752 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z59ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint32_tv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint32_tv 504 bytes stack frame, 1056 bytes spill stores, 1352 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_uint32_tv 248 bytes stack frame, 268 bytes spill stores, 284 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_LL128_SumPostDiv_uint32_tv 496 bytes stack frame, 360 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_RING_LL_SumPostDiv_uint32_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_uint32_tv 304 bytes stack frame, 392 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_LL128_SumPostDiv_uint32_tv 488 bytes stack frame, 328 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_TREE_LL_SumPostDiv_uint32_tv 344 bytes stack frame, 592 bytes spill stores, 752 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z59ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint32_tv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint32_tv 504 bytes stack frame, 1056 bytes spill stores, 1352 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_uint32_tv 248 bytes stack frame, 268 bytes spill stores, 284 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_LL128_SumPostDiv_uint32_tv 496 bytes stack frame, 360 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_RING_LL_SumPostDiv_uint32_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_uint32_tv 304 bytes stack frame, 392 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_LL128_SumPostDiv_uint32_tv 488 bytes stack frame, 328 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_TREE_LL_SumPostDiv_uint32_tv 344 bytes stack frame, 592 bytes spill stores, 752 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z59ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint32_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint32_tv 552 bytes stack frame, 1156 bytes spill stores, 1544 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_uint32_tv 288 bytes stack frame, 316 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_LL128_SumPostDiv_uint32_tv 552 bytes stack frame, 428 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_RING_LL_SumPostDiv_uint32_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_uint32_tv 336 bytes stack frame, 528 bytes spill stores, 672 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_LL128_SumPostDiv_uint32_tv 552 bytes stack frame, 400 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_TREE_LL_SumPostDiv_uint32_tv 368 bytes stack frame, 596 bytes spill stores, 836 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z59ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint32_tv 296 bytes stack frame, 360 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint32_tv 568 bytes stack frame, 1212 bytes spill stores, 1952 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_uint32_tv 416 bytes stack frame, 500 bytes spill stores, 1196 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_LL128_SumPostDiv_uint32_tv 560 bytes stack frame, 416 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_RING_LL_SumPostDiv_uint32_tv 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_uint32_tv 504 bytes stack frame, 932 bytes spill stores, 1392 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_LL128_SumPostDiv_uint32_tv 560 bytes stack frame, 400 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_TREE_LL_SumPostDiv_uint32_tv 384 bytes stack frame, 604 bytes spill stores, 868 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z59ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint32_tv 280 bytes stack frame, 308 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint32_tv 560 bytes stack frame, 1264 bytes spill stores, 2044 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_uint32_tv 312 bytes stack frame, 356 bytes spill stores, 668 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_LL128_SumPostDiv_uint32_tv 504 bytes stack frame, 360 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_RING_LL_SumPostDiv_uint32_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_uint32_tv 432 bytes stack frame, 836 bytes spill stores, 1180 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_LL128_SumPostDiv_uint32_tv 480 bytes stack frame, 300 bytes spill stores, 268 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_TREE_LL_SumPostDiv_uint32_tv 368 bytes stack frame, 572 bytes spill stores, 884 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_uint32_tv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_uint32_tv 488 bytes stack frame, 992 bytes spill stores, 1208 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Max_uint32_tv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Max_uint32_tv 472 bytes stack frame, 332 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Max_uint32_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Max_uint32_tv 320 bytes stack frame, 460 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Max_uint32_tv 488 bytes stack frame, 328 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Max_uint32_tv 344 bytes stack frame, 592 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_uint32_tv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_uint32_tv 488 bytes stack frame, 992 bytes spill stores, 1208 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Max_uint32_tv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Max_uint32_tv 472 bytes stack frame, 332 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Max_uint32_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Max_uint32_tv 320 bytes stack frame, 460 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Max_uint32_tv 488 bytes stack frame, 328 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Max_uint32_tv 344 bytes stack frame, 592 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_uint32_tv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_uint32_tv 488 bytes stack frame, 992 bytes spill stores, 1208 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Max_uint32_tv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Max_uint32_tv 472 bytes stack frame, 332 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Max_uint32_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Max_uint32_tv 320 bytes stack frame, 460 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Max_uint32_tv 488 bytes stack frame, 328 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Max_uint32_tv 344 bytes stack frame, 592 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_uint32_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_uint32_tv 536 bytes stack frame, 1148 bytes spill stores, 1496 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Max_uint32_tv 288 bytes stack frame, 316 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Max_uint32_tv 528 bytes stack frame, 384 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Max_uint32_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Max_uint32_tv 352 bytes stack frame, 532 bytes spill stores, 756 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Max_uint32_tv 560 bytes stack frame, 400 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Max_uint32_tv 368 bytes stack frame, 584 bytes spill stores, 832 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_uint32_tv 288 bytes stack frame, 312 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_uint32_tv 568 bytes stack frame, 1176 bytes spill stores, 1892 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Max_uint32_tv 400 bytes stack frame, 472 bytes spill stores, 1112 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Max_uint32_tv 528 bytes stack frame, 388 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Max_uint32_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Max_uint32_tv 496 bytes stack frame, 936 bytes spill stores, 1412 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Max_uint32_tv 560 bytes stack frame, 400 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Max_uint32_tv 376 bytes stack frame, 596 bytes spill stores, 864 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_uint32_tv 440 bytes stack frame, 752 bytes spill stores, 1140 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Max_uint32_tv 400 bytes stack frame, 680 bytes spill stores, 936 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_uint32_tv 240 bytes stack frame, 268 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_uint32_tv 560 bytes stack frame, 1248 bytes spill stores, 2012 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Max_uint32_tv 304 bytes stack frame, 336 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Max_uint32_tv 488 bytes stack frame, 348 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Max_uint32_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Max_uint32_tv 416 bytes stack frame, 828 bytes spill stores, 1188 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Max_uint32_tv 480 bytes stack frame, 288 bytes spill stores, 272 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Max_uint32_tv 360 bytes stack frame, 580 bytes spill stores, 828 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int64_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int64_tv 544 bytes stack frame, 1168 bytes spill stores, 1760 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_int64_tv 320 bytes stack frame, 344 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_SumPostDiv_int64_tv 520 bytes stack frame, 388 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_SumPostDiv_int64_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_int64_tv 312 bytes stack frame, 400 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_SumPostDiv_int64_tv 512 bytes stack frame, 384 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_SumPostDiv_int64_tv 384 bytes stack frame, 552 bytes spill stores, 796 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int64_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int64_tv 544 bytes stack frame, 1168 bytes spill stores, 1760 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_int64_tv 320 bytes stack frame, 344 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_SumPostDiv_int64_tv 520 bytes stack frame, 388 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_SumPostDiv_int64_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_int64_tv 312 bytes stack frame, 400 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_SumPostDiv_int64_tv 512 bytes stack frame, 384 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_SumPostDiv_int64_tv 384 bytes stack frame, 552 bytes spill stores, 796 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int64_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int64_tv 544 bytes stack frame, 1168 bytes spill stores, 1760 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_int64_tv 320 bytes stack frame, 344 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_SumPostDiv_int64_tv 520 bytes stack frame, 388 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_SumPostDiv_int64_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_int64_tv 312 bytes stack frame, 400 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_SumPostDiv_int64_tv 512 bytes stack frame, 384 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_SumPostDiv_int64_tv 384 bytes stack frame, 552 bytes spill stores, 796 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int64_tv 224 bytes stack frame, 220 bytes spill stores, 220 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int64_tv 592 bytes stack frame, 1276 bytes spill stores, 2000 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_int64_tv 344 bytes stack frame, 380 bytes spill stores, 620 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_SumPostDiv_int64_tv 560 bytes stack frame, 436 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_SumPostDiv_int64_tv 216 bytes stack frame, 212 bytes spill stores, 212 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_int64_tv 344 bytes stack frame, 492 bytes spill stores, 652 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_SumPostDiv_int64_tv 536 bytes stack frame, 384 bytes spill stores, 428 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_SumPostDiv_int64_tv 400 bytes stack frame, 572 bytes spill stores, 868 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int64_tv 368 bytes stack frame, 424 bytes spill stores, 832 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int64_tv 712 bytes stack frame, 1700 bytes spill stores, 2596 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_int64_tv 416 bytes stack frame, 484 bytes spill stores, 1140 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_SumPostDiv_int64_tv 568 bytes stack frame, 448 bytes spill stores, 596 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_SumPostDiv_int64_tv 216 bytes stack frame, 216 bytes spill stores, 216 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_int64_tv 496 bytes stack frame, 1032 bytes spill stores, 1548 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_SumPostDiv_int64_tv 544 bytes stack frame, 396 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_SumPostDiv_int64_tv 408 bytes stack frame, 580 bytes spill stores, 892 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int64_tv 336 bytes stack frame, 360 bytes spill stores, 696 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int64_tv 704 bytes stack frame, 1672 bytes spill stores, 2560 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_int64_tv 352 bytes stack frame, 424 bytes spill stores, 800 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_SumPostDiv_int64_tv 512 bytes stack frame, 368 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_SumPostDiv_int64_tv 216 bytes stack frame, 216 bytes spill stores, 216 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_int64_tv 464 bytes stack frame, 1044 bytes spill stores, 1536 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_SumPostDiv_int64_tv 512 bytes stack frame, 328 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_SumPostDiv_int64_tv 424 bytes stack frame, 568 bytes spill stores, 852 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z59ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint64_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint64_tv 560 bytes stack frame, 1196 bytes spill stores, 1672 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_uint64_tv 280 bytes stack frame, 316 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_LL128_SumPostDiv_uint64_tv 520 bytes stack frame, 388 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_RING_LL_SumPostDiv_uint64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_uint64_tv 296 bytes stack frame, 376 bytes spill stores, 532 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_LL128_SumPostDiv_uint64_tv 512 bytes stack frame, 356 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_TREE_LL_SumPostDiv_uint64_tv 368 bytes stack frame, 548 bytes spill stores, 760 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z59ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint64_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint64_tv 560 bytes stack frame, 1196 bytes spill stores, 1672 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_uint64_tv 280 bytes stack frame, 316 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_LL128_SumPostDiv_uint64_tv 520 bytes stack frame, 388 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_RING_LL_SumPostDiv_uint64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_uint64_tv 296 bytes stack frame, 376 bytes spill stores, 532 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_LL128_SumPostDiv_uint64_tv 512 bytes stack frame, 356 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_TREE_LL_SumPostDiv_uint64_tv 368 bytes stack frame, 548 bytes spill stores, 760 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z59ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint64_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint64_tv 560 bytes stack frame, 1196 bytes spill stores, 1672 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_uint64_tv 280 bytes stack frame, 316 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_LL128_SumPostDiv_uint64_tv 520 bytes stack frame, 388 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_RING_LL_SumPostDiv_uint64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_uint64_tv 296 bytes stack frame, 376 bytes spill stores, 532 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_LL128_SumPostDiv_uint64_tv 512 bytes stack frame, 356 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_TREE_LL_SumPostDiv_uint64_tv 368 bytes stack frame, 548 bytes spill stores, 760 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z59ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint64_tv 216 bytes stack frame, 212 bytes spill stores, 212 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint64_tv 592 bytes stack frame, 1224 bytes spill stores, 1848 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_uint64_tv 344 bytes stack frame, 376 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_LL128_SumPostDiv_uint64_tv 560 bytes stack frame, 436 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_RING_LL_SumPostDiv_uint64_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_uint64_tv 320 bytes stack frame, 416 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_LL128_SumPostDiv_uint64_tv 536 bytes stack frame, 372 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_TREE_LL_SumPostDiv_uint64_tv 376 bytes stack frame, 552 bytes spill stores, 820 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z59ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint64_tv 368 bytes stack frame, 412 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint64_tv 712 bytes stack frame, 1652 bytes spill stores, 2476 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_uint64_tv 408 bytes stack frame, 472 bytes spill stores, 1120 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_LL128_SumPostDiv_uint64_tv 568 bytes stack frame, 448 bytes spill stores, 596 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_RING_LL_SumPostDiv_uint64_tv 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_uint64_tv 496 bytes stack frame, 1028 bytes spill stores, 1544 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_LL128_SumPostDiv_uint64_tv 544 bytes stack frame, 388 bytes spill stores, 432 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_TREE_LL_SumPostDiv_uint64_tv 384 bytes stack frame, 560 bytes spill stores, 848 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z59ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint64_tv 320 bytes stack frame, 372 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint64_tv 704 bytes stack frame, 1620 bytes spill stores, 2456 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_uint64_tv 312 bytes stack frame, 352 bytes spill stores, 652 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_LL128_SumPostDiv_uint64_tv 512 bytes stack frame, 368 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_RING_LL_SumPostDiv_uint64_tv 208 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_uint64_tv 464 bytes stack frame, 996 bytes spill stores, 1436 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_LL128_SumPostDiv_uint64_tv 488 bytes stack frame, 284 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_TREE_LL_SumPostDiv_uint64_tv 400 bytes stack frame, 580 bytes spill stores, 844 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z44ncclKernel_AllReduce_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z44ncclKernel_AllReduce_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclKernel_AllReduce_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z48ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z49ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclKernel_AllReduce_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 95 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclKernel_AllReduce_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 152 bytes stack frame, 324 bytes spill stores, 492 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_doublev 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_doublev 520 bytes stack frame, 1024 bytes spill stores, 1416 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Sum_doublev 232 bytes stack frame, 252 bytes spill stores, 252 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Sum_doublev 496 bytes stack frame, 348 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Sum_doublev 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Sum_doublev 320 bytes stack frame, 396 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Sum_doublev 496 bytes stack frame, 336 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Sum_doublev 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z44ncclKernel_AllReduce_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z44ncclKernel_AllReduce_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclKernel_AllReduce_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z48ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z49ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclKernel_AllReduce_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 95 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclKernel_AllReduce_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 152 bytes stack frame, 324 bytes spill stores, 492 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_doublev 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_doublev 520 bytes stack frame, 1024 bytes spill stores, 1416 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Sum_doublev 232 bytes stack frame, 252 bytes spill stores, 252 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Sum_doublev 496 bytes stack frame, 348 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Sum_doublev 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Sum_doublev 320 bytes stack frame, 396 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Sum_doublev 496 bytes stack frame, 336 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Sum_doublev 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z44ncclKernel_AllReduce_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z44ncclKernel_AllReduce_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclKernel_AllReduce_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z48ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z49ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclKernel_AllReduce_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 95 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclKernel_AllReduce_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 152 bytes stack frame, 324 bytes spill stores, 492 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_doublev 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_doublev 520 bytes stack frame, 1024 bytes spill stores, 1416 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Sum_doublev 232 bytes stack frame, 252 bytes spill stores, 252 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Sum_doublev 496 bytes stack frame, 348 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Sum_doublev 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Sum_doublev 320 bytes stack frame, 396 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Sum_doublev 496 bytes stack frame, 336 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Sum_doublev 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z44ncclKernel_AllReduce_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z44ncclKernel_AllReduce_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclKernel_AllReduce_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z48ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z49ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclKernel_AllReduce_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclKernel_AllReduce_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 168 bytes stack frame, 320 bytes spill stores, 540 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_doublev 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_doublev 552 bytes stack frame, 1096 bytes spill stores, 1632 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Sum_doublev 288 bytes stack frame, 312 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Sum_doublev 560 bytes stack frame, 436 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Sum_doublev 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Sum_doublev 344 bytes stack frame, 484 bytes spill stores, 684 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Sum_doublev 568 bytes stack frame, 408 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Sum_doublev 360 bytes stack frame, 560 bytes spill stores, 764 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z44ncclKernel_AllReduce_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z44ncclKernel_AllReduce_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclKernel_AllReduce_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z48ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z49ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclKernel_AllReduce_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclKernel_AllReduce_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 160 bytes stack frame, 316 bytes spill stores, 548 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_doublev 240 bytes stack frame, 260 bytes spill stores, 252 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_doublev 696 bytes stack frame, 1556 bytes spill stores, 2396 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Sum_doublev 360 bytes stack frame, 396 bytes spill stores, 788 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Sum_doublev 544 bytes stack frame, 396 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Sum_doublev 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Sum_doublev 472 bytes stack frame, 960 bytes spill stores, 1496 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Sum_doublev 568 bytes stack frame, 408 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Sum_doublev 368 bytes stack frame, 580 bytes spill stores, 788 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z44ncclKernel_AllReduce_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z44ncclKernel_AllReduce_NVLS_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclKernel_AllReduce_NVLS_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z48ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z48ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z49ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z49ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclKernel_AllReduce_RING_LL_Sum_doubleP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclKernel_AllReduce_TREE_LL_Sum_doubleP11ncclDevCommmP8ncclWork 144 bytes stack frame, 284 bytes spill stores, 480 bytes spill loads ptxas info : Used 96 registers ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_doublev 440 bytes stack frame, 752 bytes spill stores, 1140 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Sum_doublev 400 bytes stack frame, 680 bytes spill stores, 936 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_doublev 224 bytes stack frame, 244 bytes spill stores, 252 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_doublev 688 bytes stack frame, 1540 bytes spill stores, 2392 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Sum_doublev 280 bytes stack frame, 296 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Sum_doublev 488 bytes stack frame, 344 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Sum_doublev 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Sum_doublev 432 bytes stack frame, 864 bytes spill stores, 1212 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Sum_doublev 480 bytes stack frame, 296 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Sum_doublev 352 bytes stack frame, 528 bytes spill stores, 788 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_uint32_tv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_uint32_tv 488 bytes stack frame, 992 bytes spill stores, 1208 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Min_uint32_tv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Min_uint32_tv 472 bytes stack frame, 332 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Min_uint32_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Min_uint32_tv 320 bytes stack frame, 460 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Min_uint32_tv 488 bytes stack frame, 328 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Min_uint32_tv 344 bytes stack frame, 592 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_uint32_tv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_uint32_tv 488 bytes stack frame, 992 bytes spill stores, 1208 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Min_uint32_tv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Min_uint32_tv 472 bytes stack frame, 332 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Min_uint32_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Min_uint32_tv 320 bytes stack frame, 460 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Min_uint32_tv 488 bytes stack frame, 328 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Min_uint32_tv 344 bytes stack frame, 592 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_uint32_tv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_uint32_tv 488 bytes stack frame, 992 bytes spill stores, 1208 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Min_uint32_tv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Min_uint32_tv 472 bytes stack frame, 332 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Min_uint32_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Min_uint32_tv 320 bytes stack frame, 460 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Min_uint32_tv 488 bytes stack frame, 328 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Min_uint32_tv 344 bytes stack frame, 592 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_uint32_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_uint32_tv 536 bytes stack frame, 1148 bytes spill stores, 1496 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Min_uint32_tv 288 bytes stack frame, 316 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Min_uint32_tv 528 bytes stack frame, 384 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Min_uint32_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Min_uint32_tv 352 bytes stack frame, 532 bytes spill stores, 756 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Min_uint32_tv 560 bytes stack frame, 400 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Min_uint32_tv 368 bytes stack frame, 584 bytes spill stores, 832 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_uint32_tv 288 bytes stack frame, 312 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_uint32_tv 568 bytes stack frame, 1176 bytes spill stores, 1892 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Min_uint32_tv 400 bytes stack frame, 472 bytes spill stores, 1112 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Min_uint32_tv 528 bytes stack frame, 388 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Min_uint32_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Min_uint32_tv 496 bytes stack frame, 936 bytes spill stores, 1412 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Min_uint32_tv 560 bytes stack frame, 400 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Min_uint32_tv 376 bytes stack frame, 596 bytes spill stores, 864 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_uint32_tv 440 bytes stack frame, 752 bytes spill stores, 1140 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Min_uint32_tv 400 bytes stack frame, 680 bytes spill stores, 936 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_uint32_tv 240 bytes stack frame, 268 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_uint32_tv 560 bytes stack frame, 1248 bytes spill stores, 2012 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Min_uint32_tv 304 bytes stack frame, 336 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Min_uint32_tv 488 bytes stack frame, 348 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Min_uint32_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Min_uint32_tv 416 bytes stack frame, 828 bytes spill stores, 1188 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Min_uint32_tv 480 bytes stack frame, 288 bytes spill stores, 272 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Min_uint32_tv 360 bytes stack frame, 580 bytes spill stores, 828 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int32_tv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int32_tv 504 bytes stack frame, 1060 bytes spill stores, 1360 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_int32_tv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_SumPostDiv_int32_tv 496 bytes stack frame, 360 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_SumPostDiv_int32_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_int32_tv 304 bytes stack frame, 396 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_SumPostDiv_int32_tv 488 bytes stack frame, 336 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_SumPostDiv_int32_tv 344 bytes stack frame, 592 bytes spill stores, 752 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int32_tv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int32_tv 504 bytes stack frame, 1060 bytes spill stores, 1360 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_int32_tv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_SumPostDiv_int32_tv 496 bytes stack frame, 360 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_SumPostDiv_int32_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_int32_tv 304 bytes stack frame, 396 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_SumPostDiv_int32_tv 488 bytes stack frame, 328 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_SumPostDiv_int32_tv 344 bytes stack frame, 592 bytes spill stores, 752 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int32_tv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int32_tv 504 bytes stack frame, 1060 bytes spill stores, 1360 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_int32_tv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_SumPostDiv_int32_tv 496 bytes stack frame, 360 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_SumPostDiv_int32_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_int32_tv 304 bytes stack frame, 396 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_SumPostDiv_int32_tv 488 bytes stack frame, 328 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_SumPostDiv_int32_tv 344 bytes stack frame, 592 bytes spill stores, 752 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int32_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int32_tv 552 bytes stack frame, 1156 bytes spill stores, 1544 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_int32_tv 288 bytes stack frame, 316 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_SumPostDiv_int32_tv 552 bytes stack frame, 428 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_SumPostDiv_int32_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_int32_tv 336 bytes stack frame, 536 bytes spill stores, 696 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_SumPostDiv_int32_tv 552 bytes stack frame, 400 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_SumPostDiv_int32_tv 368 bytes stack frame, 596 bytes spill stores, 836 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int32_tv 296 bytes stack frame, 328 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int32_tv 568 bytes stack frame, 1268 bytes spill stores, 1896 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_int32_tv 400 bytes stack frame, 448 bytes spill stores, 1136 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_SumPostDiv_int32_tv 560 bytes stack frame, 416 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_SumPostDiv_int32_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_int32_tv 504 bytes stack frame, 932 bytes spill stores, 1392 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_SumPostDiv_int32_tv 560 bytes stack frame, 400 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_SumPostDiv_int32_tv 384 bytes stack frame, 604 bytes spill stores, 868 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int32_tv 256 bytes stack frame, 304 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int32_tv 560 bytes stack frame, 1248 bytes spill stores, 1960 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_int32_tv 320 bytes stack frame, 380 bytes spill stores, 676 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_SumPostDiv_int32_tv 504 bytes stack frame, 360 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_SumPostDiv_int32_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_int32_tv 432 bytes stack frame, 836 bytes spill stores, 1180 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_SumPostDiv_int32_tv 480 bytes stack frame, 304 bytes spill stores, 272 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_SumPostDiv_int32_tv 368 bytes stack frame, 572 bytes spill stores, 884 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z40ncclKernel_AllReduce_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z40ncclKernel_AllReduce_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 95 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z40ncclKernel_AllReduce_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 152 bytes stack frame, 324 bytes spill stores, 492 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_int64_tv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_int64_tv 560 bytes stack frame, 1228 bytes spill stores, 1728 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Sum_int64_tv 248 bytes stack frame, 272 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Sum_int64_tv 520 bytes stack frame, 388 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Sum_int64_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Sum_int64_tv 296 bytes stack frame, 372 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Sum_int64_tv 504 bytes stack frame, 348 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Sum_int64_tv 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z40ncclKernel_AllReduce_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z40ncclKernel_AllReduce_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 95 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z40ncclKernel_AllReduce_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 152 bytes stack frame, 324 bytes spill stores, 492 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_int64_tv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_int64_tv 560 bytes stack frame, 1228 bytes spill stores, 1728 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Sum_int64_tv 248 bytes stack frame, 272 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Sum_int64_tv 520 bytes stack frame, 388 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Sum_int64_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Sum_int64_tv 296 bytes stack frame, 372 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Sum_int64_tv 504 bytes stack frame, 348 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Sum_int64_tv 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z40ncclKernel_AllReduce_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z40ncclKernel_AllReduce_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 95 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z40ncclKernel_AllReduce_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 152 bytes stack frame, 324 bytes spill stores, 492 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_int64_tv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_int64_tv 560 bytes stack frame, 1228 bytes spill stores, 1728 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Sum_int64_tv 248 bytes stack frame, 272 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Sum_int64_tv 520 bytes stack frame, 388 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Sum_int64_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Sum_int64_tv 296 bytes stack frame, 372 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Sum_int64_tv 504 bytes stack frame, 348 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Sum_int64_tv 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z40ncclKernel_AllReduce_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z40ncclKernel_AllReduce_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z40ncclKernel_AllReduce_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 168 bytes stack frame, 320 bytes spill stores, 540 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_int64_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_int64_tv 576 bytes stack frame, 1236 bytes spill stores, 1800 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Sum_int64_tv 296 bytes stack frame, 320 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Sum_int64_tv 560 bytes stack frame, 436 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Sum_int64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Sum_int64_tv 304 bytes stack frame, 416 bytes spill stores, 580 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Sum_int64_tv 544 bytes stack frame, 388 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Sum_int64_tv 360 bytes stack frame, 568 bytes spill stores, 764 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z40ncclKernel_AllReduce_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z40ncclKernel_AllReduce_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z40ncclKernel_AllReduce_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 160 bytes stack frame, 316 bytes spill stores, 548 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_int64_tv 296 bytes stack frame, 324 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_int64_tv 704 bytes stack frame, 1600 bytes spill stores, 2400 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Sum_int64_tv 400 bytes stack frame, 468 bytes spill stores, 1104 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Sum_int64_tv 568 bytes stack frame, 448 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Sum_int64_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Sum_int64_tv 496 bytes stack frame, 1016 bytes spill stores, 1524 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Sum_int64_tv 544 bytes stack frame, 380 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Sum_int64_tv 368 bytes stack frame, 580 bytes spill stores, 788 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z40ncclKernel_AllReduce_NVLS_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z40ncclKernel_AllReduce_RING_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z40ncclKernel_AllReduce_TREE_LL_Sum_int64_tP11ncclDevCommmP8ncclWork 144 bytes stack frame, 284 bytes spill stores, 480 bytes spill loads ptxas info : Used 96 registers ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_int64_tv 440 bytes stack frame, 740 bytes spill stores, 1076 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Sum_int64_tv 400 bytes stack frame, 680 bytes spill stores, 936 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_int64_tv 240 bytes stack frame, 264 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_int64_tv 688 bytes stack frame, 1560 bytes spill stores, 2348 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Sum_int64_tv 304 bytes stack frame, 336 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Sum_int64_tv 512 bytes stack frame, 368 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Sum_int64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Sum_int64_tv 440 bytes stack frame, 948 bytes spill stores, 1372 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Sum_int64_tv 504 bytes stack frame, 324 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Sum_int64_tv 352 bytes stack frame, 528 bytes spill stores, 788 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z46ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z41ncclKernel_AllReduce_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z50ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z51ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z51ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z41ncclKernel_AllReduce_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 95 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z41ncclKernel_AllReduce_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 152 bytes stack frame, 324 bytes spill stores, 492 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_uint64_tv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_uint64_tv 560 bytes stack frame, 1228 bytes spill stores, 1728 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Sum_uint64_tv 248 bytes stack frame, 272 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Sum_uint64_tv 520 bytes stack frame, 388 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Sum_uint64_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Sum_uint64_tv 296 bytes stack frame, 372 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Sum_uint64_tv 504 bytes stack frame, 348 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Sum_uint64_tv 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z46ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z41ncclKernel_AllReduce_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z50ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z51ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z51ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z41ncclKernel_AllReduce_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 95 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z41ncclKernel_AllReduce_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 152 bytes stack frame, 324 bytes spill stores, 492 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_uint64_tv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_uint64_tv 560 bytes stack frame, 1228 bytes spill stores, 1728 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Sum_uint64_tv 248 bytes stack frame, 272 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Sum_uint64_tv 520 bytes stack frame, 388 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Sum_uint64_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Sum_uint64_tv 296 bytes stack frame, 372 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Sum_uint64_tv 504 bytes stack frame, 348 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Sum_uint64_tv 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z46ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z41ncclKernel_AllReduce_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z50ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z51ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z51ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z41ncclKernel_AllReduce_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 95 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z41ncclKernel_AllReduce_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 152 bytes stack frame, 324 bytes spill stores, 492 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_uint64_tv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_uint64_tv 560 bytes stack frame, 1228 bytes spill stores, 1728 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Sum_uint64_tv 248 bytes stack frame, 272 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Sum_uint64_tv 520 bytes stack frame, 388 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Sum_uint64_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Sum_uint64_tv 296 bytes stack frame, 372 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Sum_uint64_tv 504 bytes stack frame, 348 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Sum_uint64_tv 328 bytes stack frame, 556 bytes spill stores, 668 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z46ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z41ncclKernel_AllReduce_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z50ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z51ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z51ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z41ncclKernel_AllReduce_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z41ncclKernel_AllReduce_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 168 bytes stack frame, 320 bytes spill stores, 540 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_uint64_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_uint64_tv 576 bytes stack frame, 1236 bytes spill stores, 1800 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Sum_uint64_tv 296 bytes stack frame, 320 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Sum_uint64_tv 560 bytes stack frame, 436 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Sum_uint64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Sum_uint64_tv 304 bytes stack frame, 416 bytes spill stores, 580 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Sum_uint64_tv 544 bytes stack frame, 388 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Sum_uint64_tv 360 bytes stack frame, 568 bytes spill stores, 764 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z46ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z41ncclKernel_AllReduce_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z50ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z51ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z51ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z41ncclKernel_AllReduce_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z41ncclKernel_AllReduce_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 160 bytes stack frame, 316 bytes spill stores, 548 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_uint64_tv 296 bytes stack frame, 324 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_uint64_tv 704 bytes stack frame, 1600 bytes spill stores, 2400 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Sum_uint64_tv 400 bytes stack frame, 468 bytes spill stores, 1104 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Sum_uint64_tv 568 bytes stack frame, 448 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Sum_uint64_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Sum_uint64_tv 496 bytes stack frame, 1016 bytes spill stores, 1524 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Sum_uint64_tv 544 bytes stack frame, 380 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Sum_uint64_tv 368 bytes stack frame, 580 bytes spill stores, 788 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z46ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z41ncclKernel_AllReduce_NVLS_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z50ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z50ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z51ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z51ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z41ncclKernel_AllReduce_RING_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z41ncclKernel_AllReduce_TREE_LL_Sum_uint64_tP11ncclDevCommmP8ncclWork 144 bytes stack frame, 284 bytes spill stores, 480 bytes spill loads ptxas info : Used 96 registers ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_uint64_tv 440 bytes stack frame, 740 bytes spill stores, 1076 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Sum_uint64_tv 400 bytes stack frame, 680 bytes spill stores, 936 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_uint64_tv 240 bytes stack frame, 264 bytes spill stores, 280 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_uint64_tv 688 bytes stack frame, 1560 bytes spill stores, 2348 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Sum_uint64_tv 304 bytes stack frame, 336 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Sum_uint64_tv 512 bytes stack frame, 368 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Sum_uint64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Sum_uint64_tv 440 bytes stack frame, 948 bytes spill stores, 1372 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Sum_uint64_tv 504 bytes stack frame, 324 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Sum_uint64_tv 352 bytes stack frame, 528 bytes spill stores, 788 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling functions.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/functions.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/functions.o` /usr/local/cuda/bin/nvcc -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc functions.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/functions.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 43288 bytes gmem ptxas info : Function properties for _Z25ncclWorkaroundClangD55580v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 43288 bytes gmem ptxas info : Function properties for _Z25ncclWorkaroundClangD55580v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 43288 bytes gmem ptxas info : Function properties for _Z25ncclWorkaroundClangD55580v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 43288 bytes gmem ptxas info : Function properties for _Z25ncclWorkaroundClangD55580v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 43288 bytes gmem ptxas info : Function properties for _Z25ncclWorkaroundClangD55580v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 43288 bytes gmem ptxas info : Function properties for _Z25ncclWorkaroundClangD55580v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z46ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z41ncclKernel_AllReduce_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z50ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z51ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z51ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z41ncclKernel_AllReduce_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z41ncclKernel_AllReduce_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 160 bytes stack frame, 316 bytes spill stores, 500 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_uint32_tv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_uint32_tv 488 bytes stack frame, 988 bytes spill stores, 1248 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Sum_uint32_tv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Sum_uint32_tv 496 bytes stack frame, 360 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Sum_uint32_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Sum_uint32_tv 320 bytes stack frame, 460 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Sum_uint32_tv 488 bytes stack frame, 328 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Sum_uint32_tv 344 bytes stack frame, 592 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z46ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z41ncclKernel_AllReduce_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z50ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z51ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z51ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z41ncclKernel_AllReduce_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z41ncclKernel_AllReduce_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 160 bytes stack frame, 316 bytes spill stores, 500 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_uint32_tv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_uint32_tv 488 bytes stack frame, 988 bytes spill stores, 1248 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Sum_uint32_tv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Sum_uint32_tv 496 bytes stack frame, 360 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Sum_uint32_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Sum_uint32_tv 320 bytes stack frame, 460 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Sum_uint32_tv 488 bytes stack frame, 328 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Sum_uint32_tv 344 bytes stack frame, 592 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z46ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z41ncclKernel_AllReduce_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z50ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z51ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z51ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z41ncclKernel_AllReduce_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z41ncclKernel_AllReduce_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 160 bytes stack frame, 316 bytes spill stores, 500 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_uint32_tv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_uint32_tv 488 bytes stack frame, 988 bytes spill stores, 1248 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Sum_uint32_tv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Sum_uint32_tv 496 bytes stack frame, 360 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Sum_uint32_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Sum_uint32_tv 320 bytes stack frame, 460 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Sum_uint32_tv 488 bytes stack frame, 328 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Sum_uint32_tv 344 bytes stack frame, 592 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z46ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z41ncclKernel_AllReduce_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z50ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z51ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z51ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z41ncclKernel_AllReduce_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z41ncclKernel_AllReduce_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 176 bytes stack frame, 344 bytes spill stores, 540 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_uint32_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_uint32_tv 536 bytes stack frame, 1148 bytes spill stores, 1496 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Sum_uint32_tv 288 bytes stack frame, 316 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Sum_uint32_tv 552 bytes stack frame, 412 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Sum_uint32_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Sum_uint32_tv 352 bytes stack frame, 580 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Sum_uint32_tv 552 bytes stack frame, 400 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Sum_uint32_tv 368 bytes stack frame, 592 bytes spill stores, 840 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z46ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z41ncclKernel_AllReduce_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z50ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z51ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z51ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z41ncclKernel_AllReduce_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z41ncclKernel_AllReduce_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 176 bytes stack frame, 356 bytes spill stores, 552 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_uint32_tv 288 bytes stack frame, 312 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_uint32_tv 568 bytes stack frame, 1180 bytes spill stores, 1792 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Sum_uint32_tv 400 bytes stack frame, 472 bytes spill stores, 1112 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Sum_uint32_tv 560 bytes stack frame, 416 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Sum_uint32_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Sum_uint32_tv 488 bytes stack frame, 932 bytes spill stores, 1392 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Sum_uint32_tv 560 bytes stack frame, 400 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Sum_uint32_tv 376 bytes stack frame, 596 bytes spill stores, 864 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z46ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z41ncclKernel_AllReduce_NVLS_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z50ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z50ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z51ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z51ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z41ncclKernel_AllReduce_RING_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_Z41ncclKernel_AllReduce_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z41ncclKernel_AllReduce_TREE_LL_Sum_uint32_tP11ncclDevCommmP8ncclWork 152 bytes stack frame, 292 bytes spill stores, 480 bytes spill loads ptxas info : Used 96 registers ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_uint32_tv 440 bytes stack frame, 796 bytes spill stores, 1228 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Sum_uint32_tv 400 bytes stack frame, 680 bytes spill stores, 936 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_uint32_tv 240 bytes stack frame, 268 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_uint32_tv 560 bytes stack frame, 1228 bytes spill stores, 1972 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Sum_uint32_tv 304 bytes stack frame, 336 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Sum_uint32_tv 504 bytes stack frame, 356 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Sum_uint32_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Sum_uint32_tv 416 bytes stack frame, 828 bytes spill stores, 1240 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Sum_uint32_tv 480 bytes stack frame, 288 bytes spill stores, 272 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Sum_uint32_tv 360 bytes stack frame, 580 bytes spill stores, 828 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z40ncclKernel_AllReduce_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z40ncclKernel_AllReduce_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z40ncclKernel_AllReduce_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 160 bytes stack frame, 316 bytes spill stores, 500 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_int32_tv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_int32_tv 488 bytes stack frame, 988 bytes spill stores, 1248 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Sum_int32_tv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Sum_int32_tv 496 bytes stack frame, 360 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Sum_int32_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Sum_int32_tv 320 bytes stack frame, 460 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Sum_int32_tv 488 bytes stack frame, 328 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Sum_int32_tv 344 bytes stack frame, 592 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z40ncclKernel_AllReduce_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z40ncclKernel_AllReduce_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z40ncclKernel_AllReduce_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 160 bytes stack frame, 316 bytes spill stores, 500 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_int32_tv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_int32_tv 488 bytes stack frame, 988 bytes spill stores, 1248 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Sum_int32_tv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Sum_int32_tv 496 bytes stack frame, 360 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Sum_int32_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Sum_int32_tv 320 bytes stack frame, 460 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Sum_int32_tv 488 bytes stack frame, 328 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Sum_int32_tv 344 bytes stack frame, 592 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z40ncclKernel_AllReduce_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z40ncclKernel_AllReduce_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z40ncclKernel_AllReduce_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 160 bytes stack frame, 316 bytes spill stores, 500 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_int32_tv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_int32_tv 488 bytes stack frame, 988 bytes spill stores, 1248 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Sum_int32_tv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Sum_int32_tv 496 bytes stack frame, 360 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Sum_int32_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Sum_int32_tv 320 bytes stack frame, 460 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Sum_int32_tv 488 bytes stack frame, 328 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Sum_int32_tv 344 bytes stack frame, 592 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z40ncclKernel_AllReduce_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z40ncclKernel_AllReduce_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z40ncclKernel_AllReduce_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 176 bytes stack frame, 344 bytes spill stores, 540 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_int32_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_int32_tv 536 bytes stack frame, 1148 bytes spill stores, 1496 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Sum_int32_tv 288 bytes stack frame, 316 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Sum_int32_tv 552 bytes stack frame, 412 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Sum_int32_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Sum_int32_tv 352 bytes stack frame, 580 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Sum_int32_tv 552 bytes stack frame, 400 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Sum_int32_tv 368 bytes stack frame, 592 bytes spill stores, 840 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z40ncclKernel_AllReduce_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z40ncclKernel_AllReduce_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z40ncclKernel_AllReduce_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 176 bytes stack frame, 356 bytes spill stores, 552 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_int32_tv 288 bytes stack frame, 312 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_int32_tv 568 bytes stack frame, 1180 bytes spill stores, 1792 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Sum_int32_tv 400 bytes stack frame, 472 bytes spill stores, 1112 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Sum_int32_tv 560 bytes stack frame, 416 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Sum_int32_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Sum_int32_tv 488 bytes stack frame, 932 bytes spill stores, 1392 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Sum_int32_tv 560 bytes stack frame, 400 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Sum_int32_tv 376 bytes stack frame, 596 bytes spill stores, 864 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z40ncclKernel_AllReduce_NVLS_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z40ncclKernel_AllReduce_RING_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z40ncclKernel_AllReduce_TREE_LL_Sum_int32_tP11ncclDevCommmP8ncclWork 152 bytes stack frame, 292 bytes spill stores, 480 bytes spill loads ptxas info : Used 96 registers ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_int32_tv 440 bytes stack frame, 796 bytes spill stores, 1228 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Sum_int32_tv 400 bytes stack frame, 680 bytes spill stores, 936 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_int32_tv 240 bytes stack frame, 268 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_int32_tv 560 bytes stack frame, 1228 bytes spill stores, 1972 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Sum_int32_tv 304 bytes stack frame, 336 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Sum_int32_tv 504 bytes stack frame, 356 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Sum_int32_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Sum_int32_tv 416 bytes stack frame, 828 bytes spill stores, 1240 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Sum_int32_tv 480 bytes stack frame, 288 bytes spill stores, 272 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Sum_int32_tv 360 bytes stack frame, 580 bytes spill stores, 828 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_TREE_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_NVLS_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_halfv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_halfv 480 bytes stack frame, 964 bytes spill stores, 1268 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_SIMPLE_Prod_halfv 272 bytes stack frame, 296 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL128_Prod_halfv 496 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_RING_LL_Prod_halfv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_SIMPLE_Prod_halfv 400 bytes stack frame, 636 bytes spill stores, 1000 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL128_Prod_halfv 584 bytes stack frame, 468 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_TREE_LL_Prod_halfv 336 bytes stack frame, 564 bytes spill stores, 716 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_TREE_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_NVLS_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_halfv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_halfv 488 bytes stack frame, 1008 bytes spill stores, 1252 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_SIMPLE_Prod_halfv 224 bytes stack frame, 240 bytes spill stores, 232 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL128_Prod_halfv 536 bytes stack frame, 396 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_RING_LL_Prod_halfv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_SIMPLE_Prod_halfv 296 bytes stack frame, 404 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL128_Prod_halfv 504 bytes stack frame, 356 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_TREE_LL_Prod_halfv 328 bytes stack frame, 536 bytes spill stores, 656 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_TREE_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_NVLS_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_halfv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_halfv 488 bytes stack frame, 976 bytes spill stores, 1296 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_SIMPLE_Prod_halfv 272 bytes stack frame, 296 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL128_Prod_halfv 496 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_RING_LL_Prod_halfv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_SIMPLE_Prod_halfv 400 bytes stack frame, 636 bytes spill stores, 1000 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL128_Prod_halfv 584 bytes stack frame, 468 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_TREE_LL_Prod_halfv 336 bytes stack frame, 564 bytes spill stores, 716 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_TREE_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_NVLS_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_halfv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_halfv 528 bytes stack frame, 1036 bytes spill stores, 1416 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_SIMPLE_Prod_halfv 280 bytes stack frame, 308 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL128_Prod_halfv 576 bytes stack frame, 484 bytes spill stores, 620 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_RING_LL_Prod_halfv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_SIMPLE_Prod_halfv 312 bytes stack frame, 456 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL128_Prod_halfv 560 bytes stack frame, 408 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_TREE_LL_Prod_halfv 352 bytes stack frame, 560 bytes spill stores, 752 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_TREE_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_NVLS_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_halfv 256 bytes stack frame, 276 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_halfv 576 bytes stack frame, 1284 bytes spill stores, 2044 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_SIMPLE_Prod_halfv 400 bytes stack frame, 476 bytes spill stores, 1012 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL128_Prod_halfv 560 bytes stack frame, 416 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_RING_LL_Prod_halfv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_SIMPLE_Prod_halfv 504 bytes stack frame, 1008 bytes spill stores, 1580 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL128_Prod_halfv 560 bytes stack frame, 408 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_TREE_LL_Prod_halfv 360 bytes stack frame, 572 bytes spill stores, 788 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_TREE_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_SIMPLE_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_NVLS_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_halfv 240 bytes stack frame, 264 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_halfv 568 bytes stack frame, 1320 bytes spill stores, 2000 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_SIMPLE_Prod_halfv 320 bytes stack frame, 352 bytes spill stores, 612 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL128_Prod_halfv 504 bytes stack frame, 360 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_RING_LL_Prod_halfv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_SIMPLE_Prod_halfv 472 bytes stack frame, 888 bytes spill stores, 1296 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL128_Prod_halfv 480 bytes stack frame, 288 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_TREE_LL_Prod_halfv 352 bytes stack frame, 540 bytes spill stores, 748 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_int64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Min_int64_tv 232 bytes stack frame, 252 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Min_int64_tv 512 bytes stack frame, 372 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Min_int64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_int64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Min_int64_tv 232 bytes stack frame, 252 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Min_int64_tv 512 bytes stack frame, 372 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Min_int64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_int64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Min_int64_tv 232 bytes stack frame, 252 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Min_int64_tv 512 bytes stack frame, 372 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Min_int64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_int64_tv 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Min_int64_tv 304 bytes stack frame, 332 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Min_int64_tv 560 bytes stack frame, 436 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Min_int64_tv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_int64_tv 336 bytes stack frame, 348 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Min_int64_tv 400 bytes stack frame, 440 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Min_int64_tv 568 bytes stack frame, 440 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Min_int64_tv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_int64_tv 288 bytes stack frame, 300 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Min_int64_tv 320 bytes stack frame, 344 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Min_int64_tv 536 bytes stack frame, 416 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Min_int64_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Min_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_uint64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Min_uint64_tv 232 bytes stack frame, 252 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Min_uint64_tv 512 bytes stack frame, 372 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Min_uint64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_uint64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Min_uint64_tv 232 bytes stack frame, 252 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Min_uint64_tv 512 bytes stack frame, 372 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Min_uint64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_uint64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Min_uint64_tv 232 bytes stack frame, 252 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Min_uint64_tv 512 bytes stack frame, 372 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Min_uint64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_uint64_tv 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Min_uint64_tv 304 bytes stack frame, 332 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Min_uint64_tv 560 bytes stack frame, 436 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Min_uint64_tv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_uint64_tv 336 bytes stack frame, 348 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Min_uint64_tv 400 bytes stack frame, 440 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Min_uint64_tv 568 bytes stack frame, 440 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Min_uint64_tv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_uint64_tv 288 bytes stack frame, 300 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Min_uint64_tv 320 bytes stack frame, 344 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Min_uint64_tv 536 bytes stack frame, 416 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Min_uint64_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Min_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z43ncclKernel_AllReduce_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z43ncclKernel_AllReduce_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_AllReduce_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z38ncclKernel_AllReduce_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z47ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z48ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_AllReduce_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z38ncclKernel_AllReduce_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z38ncclKernel_AllReduce_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z38ncclKernel_AllReduce_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 160 bytes stack frame, 316 bytes spill stores, 500 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_TREE_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_NVLS_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_floatv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_floatv 496 bytes stack frame, 1004 bytes spill stores, 1320 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_SIMPLE_Sum_floatv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL128_Sum_floatv 496 bytes stack frame, 360 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_RING_LL_Sum_floatv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_SIMPLE_Sum_floatv 336 bytes stack frame, 488 bytes spill stores, 688 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL128_Sum_floatv 504 bytes stack frame, 348 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_TREE_LL_Sum_floatv 344 bytes stack frame, 592 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z43ncclKernel_AllReduce_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z43ncclKernel_AllReduce_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_AllReduce_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z38ncclKernel_AllReduce_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z47ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z48ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_AllReduce_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z38ncclKernel_AllReduce_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z38ncclKernel_AllReduce_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z38ncclKernel_AllReduce_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 160 bytes stack frame, 316 bytes spill stores, 500 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_TREE_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_NVLS_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_floatv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_floatv 496 bytes stack frame, 1004 bytes spill stores, 1320 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_SIMPLE_Sum_floatv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL128_Sum_floatv 496 bytes stack frame, 360 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_RING_LL_Sum_floatv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_SIMPLE_Sum_floatv 336 bytes stack frame, 488 bytes spill stores, 688 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL128_Sum_floatv 504 bytes stack frame, 348 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_TREE_LL_Sum_floatv 344 bytes stack frame, 592 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z43ncclKernel_AllReduce_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z43ncclKernel_AllReduce_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_AllReduce_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z38ncclKernel_AllReduce_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z47ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z48ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_AllReduce_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z38ncclKernel_AllReduce_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z38ncclKernel_AllReduce_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z38ncclKernel_AllReduce_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 160 bytes stack frame, 316 bytes spill stores, 500 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_TREE_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_NVLS_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_floatv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_floatv 496 bytes stack frame, 1004 bytes spill stores, 1320 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_SIMPLE_Sum_floatv 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL128_Sum_floatv 496 bytes stack frame, 360 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_RING_LL_Sum_floatv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_SIMPLE_Sum_floatv 336 bytes stack frame, 488 bytes spill stores, 688 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL128_Sum_floatv 504 bytes stack frame, 348 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_TREE_LL_Sum_floatv 344 bytes stack frame, 592 bytes spill stores, 740 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z43ncclKernel_AllReduce_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z43ncclKernel_AllReduce_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_AllReduce_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z38ncclKernel_AllReduce_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z47ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z48ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_AllReduce_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z38ncclKernel_AllReduce_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_AllReduce_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z38ncclKernel_AllReduce_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 176 bytes stack frame, 344 bytes spill stores, 540 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_TREE_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_NVLS_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_floatv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_floatv 528 bytes stack frame, 1048 bytes spill stores, 1412 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_SIMPLE_Sum_floatv 288 bytes stack frame, 316 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL128_Sum_floatv 552 bytes stack frame, 412 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_RING_LL_Sum_floatv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_SIMPLE_Sum_floatv 344 bytes stack frame, 496 bytes spill stores, 668 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL128_Sum_floatv 552 bytes stack frame, 400 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_TREE_LL_Sum_floatv 368 bytes stack frame, 592 bytes spill stores, 832 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z43ncclKernel_AllReduce_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z43ncclKernel_AllReduce_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_AllReduce_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z38ncclKernel_AllReduce_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z47ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z48ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_AllReduce_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z38ncclKernel_AllReduce_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z38ncclKernel_AllReduce_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z38ncclKernel_AllReduce_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 176 bytes stack frame, 356 bytes spill stores, 552 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_TREE_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_SIMPLE_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_NVLS_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_floatv 288 bytes stack frame, 312 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_floatv 568 bytes stack frame, 1204 bytes spill stores, 1812 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_SIMPLE_Sum_floatv 400 bytes stack frame, 468 bytes spill stores, 1164 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL128_Sum_floatv 560 bytes stack frame, 432 bytes spill stores, 532 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_RING_LL_Sum_floatv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_SIMPLE_Sum_floatv 480 bytes stack frame, 948 bytes spill stores, 1412 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL128_Sum_floatv 560 bytes stack frame, 400 bytes spill stores, 504 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_TREE_LL_Sum_floatv 376 bytes stack frame, 604 bytes spill stores, 856 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z43ncclKernel_AllReduce_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z43ncclKernel_AllReduce_NVLS_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z38ncclKernel_AllReduce_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z38ncclKernel_AllReduce_NVLS_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z47ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z47ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z48ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z48ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_floatP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z38ncclKernel_AllReduce_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z38ncclKernel_AllReduce_RING_LL_Sum_floatP11ncclDevCommmP8ncclWork 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_Z38ncclKernel_AllReduce_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z38ncclKernel_AllReduce_TREE_LL_Sum_floatP11ncclDevCommmP8ncclWork 152 bytes stack frame, 292 bytes spill stores, 480 bytes spill loads ptxas info : Used 96 registers ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_floatv 504 bytes stack frame, 916 bytes spill stores, 1396 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_TREE_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_SIMPLE_Sum_floatv 400 bytes stack frame, 664 bytes spill stores, 920 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_NVLS_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_floatv 240 bytes stack frame, 268 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_floatv 560 bytes stack frame, 1248 bytes spill stores, 1988 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_SIMPLE_Sum_floatv 304 bytes stack frame, 352 bytes spill stores, 640 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL128_Sum_floatv 504 bytes stack frame, 356 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_RING_LL_Sum_floatv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_SIMPLE_Sum_floatv 416 bytes stack frame, 828 bytes spill stores, 1196 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL128_Sum_floatv 488 bytes stack frame, 308 bytes spill stores, 276 bytes spill loads ptxas info : Function properties for _Z40ncclFunction_AllReduce_TREE_LL_Sum_floatv 360 bytes stack frame, 572 bytes spill stores, 820 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_doublev 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Min_doublev 216 bytes stack frame, 232 bytes spill stores, 224 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Min_doublev 520 bytes stack frame, 380 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Min_doublev 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_doublev 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Min_doublev 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Min_doublev 520 bytes stack frame, 380 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Min_doublev 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_doublev 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Min_doublev 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Min_doublev 520 bytes stack frame, 380 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Min_doublev 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_doublev 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Min_doublev 304 bytes stack frame, 328 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Min_doublev 560 bytes stack frame, 436 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Min_doublev 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_doublev 336 bytes stack frame, 348 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Min_doublev 408 bytes stack frame, 448 bytes spill stores, 796 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Min_doublev 552 bytes stack frame, 428 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Min_doublev 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_doublev 288 bytes stack frame, 300 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Min_doublev 320 bytes stack frame, 360 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Min_doublev 512 bytes stack frame, 368 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Min_doublev 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Min_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_int64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Max_int64_tv 232 bytes stack frame, 252 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Max_int64_tv 512 bytes stack frame, 372 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Max_int64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_int64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Max_int64_tv 232 bytes stack frame, 252 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Max_int64_tv 512 bytes stack frame, 372 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Max_int64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_int64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Max_int64_tv 232 bytes stack frame, 252 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Max_int64_tv 512 bytes stack frame, 372 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Max_int64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_int64_tv 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Max_int64_tv 304 bytes stack frame, 332 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Max_int64_tv 560 bytes stack frame, 436 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Max_int64_tv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_int64_tv 336 bytes stack frame, 348 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Max_int64_tv 400 bytes stack frame, 440 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Max_int64_tv 568 bytes stack frame, 440 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Max_int64_tv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_int64_tv 288 bytes stack frame, 300 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Max_int64_tv 320 bytes stack frame, 344 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Max_int64_tv 536 bytes stack frame, 416 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Max_int64_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Max_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_uint64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Max_uint64_tv 232 bytes stack frame, 252 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Max_uint64_tv 512 bytes stack frame, 372 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Max_uint64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_uint64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Max_uint64_tv 232 bytes stack frame, 252 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Max_uint64_tv 512 bytes stack frame, 372 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Max_uint64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_uint64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Max_uint64_tv 232 bytes stack frame, 252 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Max_uint64_tv 512 bytes stack frame, 372 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Max_uint64_tv 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_uint64_tv 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Max_uint64_tv 304 bytes stack frame, 332 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Max_uint64_tv 560 bytes stack frame, 436 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Max_uint64_tv 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_uint64_tv 336 bytes stack frame, 348 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Max_uint64_tv 400 bytes stack frame, 440 bytes spill stores, 776 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Max_uint64_tv 568 bytes stack frame, 440 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Max_uint64_tv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_uint64_tv 288 bytes stack frame, 300 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Max_uint64_tv 320 bytes stack frame, 344 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Max_uint64_tv 536 bytes stack frame, 416 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Max_uint64_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Max_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_floatv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_NVLS_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_SIMPLE_Min_floatv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL128_Min_floatv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_RING_LL_Min_floatv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_TREE_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_floatv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_NVLS_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_SIMPLE_Min_floatv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL128_Min_floatv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_RING_LL_Min_floatv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_TREE_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_floatv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_NVLS_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_SIMPLE_Min_floatv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL128_Min_floatv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_RING_LL_Min_floatv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_TREE_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_floatv 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_NVLS_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_SIMPLE_Min_floatv 304 bytes stack frame, 332 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL128_Min_floatv 552 bytes stack frame, 420 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_RING_LL_Min_floatv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_TREE_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_floatv 336 bytes stack frame, 348 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_NVLS_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_SIMPLE_Min_floatv 408 bytes stack frame, 444 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL128_Min_floatv 560 bytes stack frame, 432 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_RING_LL_Min_floatv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_TREE_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_floatv 288 bytes stack frame, 300 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_NVLS_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_SIMPLE_Min_floatv 312 bytes stack frame, 340 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL128_Min_floatv 528 bytes stack frame, 400 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_RING_LL_Min_floatv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_SIMPLE_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL128_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_TREE_LL_Min_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_int32_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Max_int32_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Max_int32_tv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Max_int32_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_int32_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Max_int32_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Max_int32_tv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Max_int32_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_int32_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Max_int32_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Max_int32_tv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Max_int32_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_int32_tv 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Max_int32_tv 304 bytes stack frame, 332 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Max_int32_tv 552 bytes stack frame, 420 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Max_int32_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_int32_tv 336 bytes stack frame, 348 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Max_int32_tv 408 bytes stack frame, 444 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Max_int32_tv 560 bytes stack frame, 432 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Max_int32_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_int32_tv 288 bytes stack frame, 300 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Max_int32_tv 312 bytes stack frame, 340 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Max_int32_tv 528 bytes stack frame, 400 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Max_int32_tv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Max_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_halfv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_NVLS_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_SIMPLE_Min_halfv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL128_Min_halfv 512 bytes stack frame, 380 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_RING_LL_Min_halfv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_TREE_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_halfv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_NVLS_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_SIMPLE_Min_halfv 240 bytes stack frame, 264 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL128_Min_halfv 512 bytes stack frame, 380 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_RING_LL_Min_halfv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_TREE_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_halfv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_NVLS_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_SIMPLE_Min_halfv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL128_Min_halfv 512 bytes stack frame, 380 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_RING_LL_Min_halfv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_TREE_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_halfv 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_NVLS_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_SIMPLE_Min_halfv 312 bytes stack frame, 340 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL128_Min_halfv 568 bytes stack frame, 444 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_RING_LL_Min_halfv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_TREE_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_halfv 336 bytes stack frame, 348 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_NVLS_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_SIMPLE_Min_halfv 384 bytes stack frame, 440 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL128_Min_halfv 568 bytes stack frame, 432 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_RING_LL_Min_halfv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_TREE_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_TREE_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_SIMPLE_Min_halfv 272 bytes stack frame, 288 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_NVLS_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_SIMPLE_Min_halfv 280 bytes stack frame, 304 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL128_Min_halfv 536 bytes stack frame, 400 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_RING_LL_Min_halfv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_TREE_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_uint32_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Max_uint32_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Max_uint32_tv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Max_uint32_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_uint32_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Max_uint32_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Max_uint32_tv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Max_uint32_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_uint32_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Max_uint32_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Max_uint32_tv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Max_uint32_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_uint32_tv 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Max_uint32_tv 304 bytes stack frame, 332 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Max_uint32_tv 552 bytes stack frame, 420 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Max_uint32_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_uint32_tv 336 bytes stack frame, 348 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Max_uint32_tv 408 bytes stack frame, 444 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Max_uint32_tv 560 bytes stack frame, 432 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Max_uint32_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_uint32_tv 288 bytes stack frame, 300 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_SIMPLE_Max_uint32_tv 312 bytes stack frame, 340 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL128_Max_uint32_tv 528 bytes stack frame, 400 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL_Max_uint32_tv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_SIMPLE_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL128_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL_Max_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_doublev 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Max_doublev 216 bytes stack frame, 232 bytes spill stores, 224 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Max_doublev 520 bytes stack frame, 380 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Max_doublev 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_doublev 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Max_doublev 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Max_doublev 520 bytes stack frame, 380 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Max_doublev 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_doublev 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Max_doublev 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Max_doublev 520 bytes stack frame, 380 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Max_doublev 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_doublev 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Max_doublev 304 bytes stack frame, 328 bytes spill stores, 352 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Max_doublev 560 bytes stack frame, 436 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Max_doublev 168 bytes stack frame, 164 bytes spill stores, 164 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_doublev 336 bytes stack frame, 348 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Max_doublev 408 bytes stack frame, 448 bytes spill stores, 796 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Max_doublev 552 bytes stack frame, 428 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Max_doublev 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_doublev 288 bytes stack frame, 300 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Max_doublev 320 bytes stack frame, 360 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Max_doublev 512 bytes stack frame, 368 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Max_doublev 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Max_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_floatv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_NVLS_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_SIMPLE_Max_floatv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL128_Max_floatv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_RING_LL_Max_floatv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_TREE_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_floatv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_NVLS_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_SIMPLE_Max_floatv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL128_Max_floatv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_RING_LL_Max_floatv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_TREE_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_floatv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_NVLS_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_SIMPLE_Max_floatv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL128_Max_floatv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_RING_LL_Max_floatv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_TREE_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_floatv 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_NVLS_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_SIMPLE_Max_floatv 304 bytes stack frame, 332 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL128_Max_floatv 552 bytes stack frame, 420 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_RING_LL_Max_floatv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_TREE_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_floatv 336 bytes stack frame, 348 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_NVLS_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_SIMPLE_Max_floatv 408 bytes stack frame, 444 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL128_Max_floatv 560 bytes stack frame, 432 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_RING_LL_Max_floatv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_TREE_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_floatv 288 bytes stack frame, 300 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_NVLS_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_SIMPLE_Max_floatv 312 bytes stack frame, 340 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_LL128_Max_floatv 528 bytes stack frame, 400 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_RING_LL_Max_floatv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_SIMPLE_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_LL128_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_ReduceScatter_TREE_LL_Max_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_halfv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_NVLS_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_SIMPLE_Max_halfv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL128_Max_halfv 512 bytes stack frame, 380 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_RING_LL_Max_halfv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_TREE_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_halfv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_NVLS_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_SIMPLE_Max_halfv 240 bytes stack frame, 264 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL128_Max_halfv 512 bytes stack frame, 380 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_RING_LL_Max_halfv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_TREE_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_halfv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_NVLS_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_SIMPLE_Max_halfv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL128_Max_halfv 512 bytes stack frame, 380 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_RING_LL_Max_halfv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_TREE_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_halfv 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_NVLS_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_SIMPLE_Max_halfv 312 bytes stack frame, 340 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL128_Max_halfv 568 bytes stack frame, 444 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_RING_LL_Max_halfv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_TREE_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_halfv 336 bytes stack frame, 348 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_NVLS_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_SIMPLE_Max_halfv 384 bytes stack frame, 440 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL128_Max_halfv 568 bytes stack frame, 432 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_RING_LL_Max_halfv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_TREE_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_halfv 272 bytes stack frame, 288 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_NVLS_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_RING_SIMPLE_Max_halfv 280 bytes stack frame, 304 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL128_Max_halfv 536 bytes stack frame, 400 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_RING_LL_Max_halfv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_ReduceScatter_TREE_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_ReduceScatter_TREE_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling onerank_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/onerank_reduce.o mkdir -p `dirname /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/onerank_reduce.o` /usr/local/cuda/bin/nvcc -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc onerank_reduce.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/onerank_reduce.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z43ncclFunction_OneRankReduce_PreMulSum_doublev 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_OneRankReduce_PreMulSum_floatv 64 bytes stack frame, 60 bytes spill stores, 60 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_OneRankReduce_PreMulSum___nv_bfloat16v 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_OneRankReduce_PreMulSum_halfv 64 bytes stack frame, 60 bytes spill stores, 60 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_OneRankReduce_PreMulSum_uint64_tv 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_OneRankReduce_PreMulSum_int64_tv 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_OneRankReduce_PreMulSum_uint32_tv 64 bytes stack frame, 60 bytes spill stores, 60 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_OneRankReduce_PreMulSum_int32_tv 64 bytes stack frame, 60 bytes spill stores, 60 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_OneRankReduce_PreMulSum_uint8_tv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_OneRankReduce_PreMulSum_int8_tv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z43ncclFunction_OneRankReduce_PreMulSum_doublev 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_OneRankReduce_PreMulSum_floatv 64 bytes stack frame, 60 bytes spill stores, 60 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_OneRankReduce_PreMulSum___nv_bfloat16v 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_OneRankReduce_PreMulSum_halfv 64 bytes stack frame, 60 bytes spill stores, 60 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_OneRankReduce_PreMulSum_uint64_tv 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_OneRankReduce_PreMulSum_int64_tv 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_OneRankReduce_PreMulSum_uint32_tv 64 bytes stack frame, 60 bytes spill stores, 60 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_OneRankReduce_PreMulSum_int32_tv 64 bytes stack frame, 60 bytes spill stores, 60 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_OneRankReduce_PreMulSum_uint8_tv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_OneRankReduce_PreMulSum_int8_tv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z43ncclFunction_OneRankReduce_PreMulSum_doublev 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_OneRankReduce_PreMulSum_floatv 64 bytes stack frame, 60 bytes spill stores, 60 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_OneRankReduce_PreMulSum___nv_bfloat16v 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_OneRankReduce_PreMulSum_halfv 64 bytes stack frame, 60 bytes spill stores, 60 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_OneRankReduce_PreMulSum_uint64_tv 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_OneRankReduce_PreMulSum_int64_tv 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_OneRankReduce_PreMulSum_uint32_tv 64 bytes stack frame, 60 bytes spill stores, 60 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_OneRankReduce_PreMulSum_int32_tv 64 bytes stack frame, 60 bytes spill stores, 60 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_OneRankReduce_PreMulSum_uint8_tv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_OneRankReduce_PreMulSum_int8_tv 136 bytes stack frame, 136 bytes spill stores, 136 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z43ncclFunction_OneRankReduce_PreMulSum_doublev 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_OneRankReduce_PreMulSum_floatv 72 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_OneRankReduce_PreMulSum___nv_bfloat16v 80 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_OneRankReduce_PreMulSum_halfv 72 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_OneRankReduce_PreMulSum_uint64_tv 72 bytes stack frame, 72 bytes spill stores, 72 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_OneRankReduce_PreMulSum_int64_tv 72 bytes stack frame, 72 bytes spill stores, 72 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_OneRankReduce_PreMulSum_uint32_tv 72 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_OneRankReduce_PreMulSum_int32_tv 72 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_OneRankReduce_PreMulSum_uint8_tv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_OneRankReduce_PreMulSum_int8_tv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z43ncclFunction_OneRankReduce_PreMulSum_doublev 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_OneRankReduce_PreMulSum_floatv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_OneRankReduce_PreMulSum___nv_bfloat16v 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_OneRankReduce_PreMulSum_halfv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_OneRankReduce_PreMulSum_uint64_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_OneRankReduce_PreMulSum_int64_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_OneRankReduce_PreMulSum_uint32_tv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_OneRankReduce_PreMulSum_int32_tv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_OneRankReduce_PreMulSum_uint8_tv 224 bytes stack frame, 224 bytes spill stores, 220 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_OneRankReduce_PreMulSum_int8_tv 224 bytes stack frame, 224 bytes spill stores, 220 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z43ncclFunction_OneRankReduce_PreMulSum_doublev 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_OneRankReduce_PreMulSum_floatv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_OneRankReduce_PreMulSum___nv_bfloat16v 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_OneRankReduce_PreMulSum_halfv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_OneRankReduce_PreMulSum_uint64_tv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_OneRankReduce_PreMulSum_int64_tv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_OneRankReduce_PreMulSum_uint32_tv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_OneRankReduce_PreMulSum_int32_tv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_OneRankReduce_PreMulSum_uint8_tv 208 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_OneRankReduce_PreMulSum_int8_tv 208 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_uint8_tv 272 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Max_uint8_tv 272 bytes stack frame, 300 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Max_uint8_tv 504 bytes stack frame, 348 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Max_uint8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_uint8_tv 272 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Max_uint8_tv 272 bytes stack frame, 300 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Max_uint8_tv 504 bytes stack frame, 348 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Max_uint8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_uint8_tv 272 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Max_uint8_tv 272 bytes stack frame, 300 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Max_uint8_tv 504 bytes stack frame, 348 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Max_uint8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_uint8_tv 288 bytes stack frame, 308 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Max_uint8_tv 392 bytes stack frame, 436 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Max_uint8_tv 568 bytes stack frame, 460 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Max_uint8_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_uint8_tv 424 bytes stack frame, 492 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Max_uint8_tv 480 bytes stack frame, 668 bytes spill stores, 1248 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Max_uint8_tv 568 bytes stack frame, 440 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Max_uint8_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_uint8_tv 416 bytes stack frame, 556 bytes spill stores, 956 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_NVLS_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_SIMPLE_Max_uint8_tv 416 bytes stack frame, 528 bytes spill stores, 832 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL128_Max_uint8_tv 544 bytes stack frame, 400 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_RING_LL_Max_uint8_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_ReduceScatter_TREE_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_int8_tv 272 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Max_int8_tv 272 bytes stack frame, 300 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Max_int8_tv 504 bytes stack frame, 348 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Max_int8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_int8_tv 272 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Max_int8_tv 272 bytes stack frame, 300 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Max_int8_tv 504 bytes stack frame, 348 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Max_int8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_int8_tv 272 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Max_int8_tv 272 bytes stack frame, 300 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Max_int8_tv 504 bytes stack frame, 348 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Max_int8_tv 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_int8_tv 288 bytes stack frame, 308 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Max_int8_tv 392 bytes stack frame, 436 bytes spill stores, 792 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Max_int8_tv 568 bytes stack frame, 460 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Max_int8_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_int8_tv 424 bytes stack frame, 492 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Max_int8_tv 488 bytes stack frame, 676 bytes spill stores, 1256 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Max_int8_tv 568 bytes stack frame, 440 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Max_int8_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_TREE_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_SIMPLE_Max_int8_tv 416 bytes stack frame, 556 bytes spill stores, 956 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_NVLS_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_NVLS_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_SIMPLE_Max_int8_tv 408 bytes stack frame, 532 bytes spill stores, 828 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_RING_LL128_Max_int8_tv 536 bytes stack frame, 392 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_RING_LL_Max_int8_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_ReduceScatter_TREE_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_ReduceScatter_TREE_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_Min___nv_bfloat16v 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_Min___nv_bfloat16v 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_Min___nv_bfloat16v 520 bytes stack frame, 396 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_Min___nv_bfloat16v 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_Min___nv_bfloat16v 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_Min___nv_bfloat16v 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_Min___nv_bfloat16v 520 bytes stack frame, 396 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_Min___nv_bfloat16v 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_Min___nv_bfloat16v 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_Min___nv_bfloat16v 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_Min___nv_bfloat16v 520 bytes stack frame, 396 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_Min___nv_bfloat16v 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_Min___nv_bfloat16v 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_Min___nv_bfloat16v 264 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_Min___nv_bfloat16v 560 bytes stack frame, 428 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_Min___nv_bfloat16v 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_Min___nv_bfloat16v 336 bytes stack frame, 348 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_Min___nv_bfloat16v 384 bytes stack frame, 440 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_Min___nv_bfloat16v 568 bytes stack frame, 432 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_Min___nv_bfloat16v 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_Min___nv_bfloat16v 272 bytes stack frame, 288 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_Min___nv_bfloat16v 280 bytes stack frame, 304 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_Min___nv_bfloat16v 536 bytes stack frame, 400 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_Min___nv_bfloat16v 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem ptxas info : 0 bytes gmem make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling sendrecv.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclKernel_SendRecv_RING_SIMPLE_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z42ncclKernel_SendRecv_RING_SIMPLE_Sum_int8_tP11ncclDevCommmP8ncclWork 136 bytes stack frame, 152 bytes spill stores, 212 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z44ncclFunction_SendRecv_RING_SIMPLE_Sum_int8_tv 200 bytes stack frame, 200 bytes spill stores, 204 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclKernel_SendRecv_RING_SIMPLE_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z42ncclKernel_SendRecv_RING_SIMPLE_Sum_int8_tP11ncclDevCommmP8ncclWork 136 bytes stack frame, 152 bytes spill stores, 212 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z44ncclFunction_SendRecv_RING_SIMPLE_Sum_int8_tv 200 bytes stack frame, 200 bytes spill stores, 204 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclKernel_SendRecv_RING_SIMPLE_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z42ncclKernel_SendRecv_RING_SIMPLE_Sum_int8_tP11ncclDevCommmP8ncclWork 136 bytes stack frame, 152 bytes spill stores, 212 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z44ncclFunction_SendRecv_RING_SIMPLE_Sum_int8_tv 200 bytes stack frame, 200 bytes spill stores, 204 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclKernel_SendRecv_RING_SIMPLE_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z42ncclKernel_SendRecv_RING_SIMPLE_Sum_int8_tP11ncclDevCommmP8ncclWork 120 bytes stack frame, 144 bytes spill stores, 208 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z44ncclFunction_SendRecv_RING_SIMPLE_Sum_int8_tv 224 bytes stack frame, 228 bytes spill stores, 236 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclKernel_SendRecv_RING_SIMPLE_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z42ncclKernel_SendRecv_RING_SIMPLE_Sum_int8_tP11ncclDevCommmP8ncclWork 256 bytes stack frame, 580 bytes spill stores, 952 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z44ncclFunction_SendRecv_RING_SIMPLE_Sum_int8_tv 352 bytes stack frame, 604 bytes spill stores, 868 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Compiling entry function '_Z42ncclKernel_SendRecv_RING_SIMPLE_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z42ncclKernel_SendRecv_RING_SIMPLE_Sum_int8_tP11ncclDevCommmP8ncclWork 168 bytes stack frame, 504 bytes spill stores, 804 bytes spill loads ptxas info : Used 96 registers ptxas info : Function properties for _Z44ncclFunction_SendRecv_RING_SIMPLE_Sum_int8_tv 280 bytes stack frame, 416 bytes spill stores, 576 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_TREE_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_NVLS_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_halfv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_halfv 488 bytes stack frame, 984 bytes spill stores, 1304 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_SIMPLE_Min_halfv 272 bytes stack frame, 308 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL128_Min_halfv 496 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_RING_LL_Min_halfv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_SIMPLE_Min_halfv 408 bytes stack frame, 636 bytes spill stores, 992 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL128_Min_halfv 584 bytes stack frame, 468 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_TREE_LL_Min_halfv 336 bytes stack frame, 564 bytes spill stores, 716 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_TREE_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_NVLS_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_halfv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_halfv 480 bytes stack frame, 940 bytes spill stores, 1272 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_SIMPLE_Min_halfv 256 bytes stack frame, 280 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL128_Min_halfv 496 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_RING_LL_Min_halfv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_SIMPLE_Min_halfv 408 bytes stack frame, 636 bytes spill stores, 992 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL128_Min_halfv 584 bytes stack frame, 468 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_TREE_LL_Min_halfv 336 bytes stack frame, 564 bytes spill stores, 716 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_TREE_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_NVLS_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_halfv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_halfv 488 bytes stack frame, 984 bytes spill stores, 1304 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_SIMPLE_Min_halfv 272 bytes stack frame, 308 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL128_Min_halfv 496 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_RING_LL_Min_halfv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_SIMPLE_Min_halfv 408 bytes stack frame, 636 bytes spill stores, 992 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL128_Min_halfv 584 bytes stack frame, 468 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_TREE_LL_Min_halfv 336 bytes stack frame, 564 bytes spill stores, 716 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_TREE_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_NVLS_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_halfv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_halfv 520 bytes stack frame, 1020 bytes spill stores, 1404 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_SIMPLE_Min_halfv 312 bytes stack frame, 340 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL128_Min_halfv 544 bytes stack frame, 424 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_RING_LL_Min_halfv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_SIMPLE_Min_halfv 408 bytes stack frame, 660 bytes spill stores, 1036 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL128_Min_halfv 640 bytes stack frame, 528 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_TREE_LL_Min_halfv 360 bytes stack frame, 584 bytes spill stores, 804 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_TREE_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_SIMPLE_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_NVLS_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_halfv 256 bytes stack frame, 276 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_halfv 576 bytes stack frame, 1284 bytes spill stores, 2044 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_SIMPLE_Min_halfv 400 bytes stack frame, 476 bytes spill stores, 1012 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL128_Min_halfv 560 bytes stack frame, 416 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_RING_LL_Min_halfv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_SIMPLE_Min_halfv 504 bytes stack frame, 1008 bytes spill stores, 1580 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL128_Min_halfv 560 bytes stack frame, 408 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_TREE_LL_Min_halfv 360 bytes stack frame, 572 bytes spill stores, 788 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_halfv 520 bytes stack frame, 952 bytes spill stores, 1388 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_TREE_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_SIMPLE_Min_halfv 416 bytes stack frame, 716 bytes spill stores, 1100 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_NVLS_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_halfv 240 bytes stack frame, 264 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_halfv 568 bytes stack frame, 1320 bytes spill stores, 2000 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_SIMPLE_Min_halfv 320 bytes stack frame, 352 bytes spill stores, 612 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL128_Min_halfv 504 bytes stack frame, 360 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_RING_LL_Min_halfv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_SIMPLE_Min_halfv 472 bytes stack frame, 888 bytes spill stores, 1296 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL128_Min_halfv 480 bytes stack frame, 288 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_TREE_LL_Min_halfv 352 bytes stack frame, 540 bytes spill stores, 748 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_uint8_tv 216 bytes stack frame, 236 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_uint8_tv 512 bytes stack frame, 1156 bytes spill stores, 1648 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Prod_uint8_tv 288 bytes stack frame, 312 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Prod_uint8_tv 464 bytes stack frame, 312 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Prod_uint8_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Prod_uint8_tv 488 bytes stack frame, 904 bytes spill stores, 1320 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Prod_uint8_tv 816 bytes stack frame, 416 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Prod_uint8_tv 344 bytes stack frame, 580 bytes spill stores, 744 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_uint8_tv 216 bytes stack frame, 236 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_uint8_tv 512 bytes stack frame, 1156 bytes spill stores, 1648 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Prod_uint8_tv 288 bytes stack frame, 312 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Prod_uint8_tv 464 bytes stack frame, 312 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Prod_uint8_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Prod_uint8_tv 488 bytes stack frame, 904 bytes spill stores, 1320 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Prod_uint8_tv 816 bytes stack frame, 416 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Prod_uint8_tv 344 bytes stack frame, 580 bytes spill stores, 744 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_uint8_tv 216 bytes stack frame, 236 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_uint8_tv 512 bytes stack frame, 1156 bytes spill stores, 1648 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Prod_uint8_tv 288 bytes stack frame, 312 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Prod_uint8_tv 464 bytes stack frame, 312 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Prod_uint8_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Prod_uint8_tv 488 bytes stack frame, 904 bytes spill stores, 1320 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Prod_uint8_tv 816 bytes stack frame, 416 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Prod_uint8_tv 344 bytes stack frame, 580 bytes spill stores, 744 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_uint8_tv 232 bytes stack frame, 252 bytes spill stores, 244 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_uint8_tv 552 bytes stack frame, 1180 bytes spill stores, 1692 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Prod_uint8_tv 336 bytes stack frame, 384 bytes spill stores, 632 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Prod_uint8_tv 544 bytes stack frame, 416 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Prod_uint8_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Prod_uint8_tv 536 bytes stack frame, 972 bytes spill stores, 1452 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Prod_uint8_tv 856 bytes stack frame, 476 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Prod_uint8_tv 376 bytes stack frame, 588 bytes spill stores, 848 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_uint8_tv 376 bytes stack frame, 640 bytes spill stores, 1324 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_uint8_tv 688 bytes stack frame, 1840 bytes spill stores, 3188 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Prod_uint8_tv 536 bytes stack frame, 852 bytes spill stores, 1828 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Prod_uint8_tv 544 bytes stack frame, 396 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Prod_uint8_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Prod_uint8_tv 1056 bytes stack frame, 2552 bytes spill stores, 3260 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Prod_uint8_tv 856 bytes stack frame, 480 bytes spill stores, 664 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Prod_uint8_tv 384 bytes stack frame, 592 bytes spill stores, 840 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_SIMPLE_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_uint8_tv 360 bytes stack frame, 632 bytes spill stores, 1468 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_uint8_tv 672 bytes stack frame, 2024 bytes spill stores, 3816 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_SIMPLE_Prod_uint8_tv 416 bytes stack frame, 644 bytes spill stores, 1348 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_LL128_Prod_uint8_tv 496 bytes stack frame, 344 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_LL_Prod_uint8_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_SIMPLE_Prod_uint8_tv 1032 bytes stack frame, 2644 bytes spill stores, 3372 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_LL128_Prod_uint8_tv 792 bytes stack frame, 384 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_LL_Prod_uint8_tv 368 bytes stack frame, 560 bytes spill stores, 868 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_TREE_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_NVLS_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_halfv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_halfv 488 bytes stack frame, 984 bytes spill stores, 1304 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_SIMPLE_Max_halfv 272 bytes stack frame, 308 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL128_Max_halfv 496 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_RING_LL_Max_halfv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_SIMPLE_Max_halfv 408 bytes stack frame, 636 bytes spill stores, 992 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL128_Max_halfv 584 bytes stack frame, 468 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_TREE_LL_Max_halfv 336 bytes stack frame, 564 bytes spill stores, 716 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_TREE_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_NVLS_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_halfv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_halfv 480 bytes stack frame, 940 bytes spill stores, 1272 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_SIMPLE_Max_halfv 256 bytes stack frame, 280 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL128_Max_halfv 496 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_RING_LL_Max_halfv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_SIMPLE_Max_halfv 408 bytes stack frame, 636 bytes spill stores, 992 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL128_Max_halfv 584 bytes stack frame, 468 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_TREE_LL_Max_halfv 336 bytes stack frame, 564 bytes spill stores, 716 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_TREE_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_NVLS_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_halfv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_halfv 488 bytes stack frame, 984 bytes spill stores, 1304 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_SIMPLE_Max_halfv 272 bytes stack frame, 308 bytes spill stores, 424 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL128_Max_halfv 496 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_RING_LL_Max_halfv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_SIMPLE_Max_halfv 408 bytes stack frame, 636 bytes spill stores, 992 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL128_Max_halfv 584 bytes stack frame, 468 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_TREE_LL_Max_halfv 336 bytes stack frame, 564 bytes spill stores, 716 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_TREE_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_NVLS_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_halfv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_halfv 520 bytes stack frame, 1020 bytes spill stores, 1404 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_SIMPLE_Max_halfv 312 bytes stack frame, 340 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL128_Max_halfv 544 bytes stack frame, 424 bytes spill stores, 524 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_RING_LL_Max_halfv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_SIMPLE_Max_halfv 408 bytes stack frame, 660 bytes spill stores, 1036 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL128_Max_halfv 640 bytes stack frame, 528 bytes spill stores, 732 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_TREE_LL_Max_halfv 360 bytes stack frame, 584 bytes spill stores, 804 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_TREE_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_SIMPLE_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_NVLS_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_halfv 256 bytes stack frame, 276 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_halfv 576 bytes stack frame, 1284 bytes spill stores, 2044 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_SIMPLE_Max_halfv 400 bytes stack frame, 476 bytes spill stores, 1012 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL128_Max_halfv 560 bytes stack frame, 416 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_RING_LL_Max_halfv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_SIMPLE_Max_halfv 504 bytes stack frame, 1008 bytes spill stores, 1580 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL128_Max_halfv 560 bytes stack frame, 408 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_TREE_LL_Max_halfv 360 bytes stack frame, 572 bytes spill stores, 788 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_halfv 520 bytes stack frame, 952 bytes spill stores, 1388 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_TREE_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_SIMPLE_Max_halfv 416 bytes stack frame, 716 bytes spill stores, 1100 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_NVLS_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_halfv 240 bytes stack frame, 264 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_halfv 568 bytes stack frame, 1320 bytes spill stores, 2000 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_SIMPLE_Max_halfv 320 bytes stack frame, 352 bytes spill stores, 612 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL128_Max_halfv 504 bytes stack frame, 360 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_RING_LL_Max_halfv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_SIMPLE_Max_halfv 472 bytes stack frame, 888 bytes spill stores, 1296 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL128_Max_halfv 480 bytes stack frame, 288 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_TREE_LL_Max_halfv 352 bytes stack frame, 540 bytes spill stores, 748 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_uint32_tv 208 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_uint32_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_PreMulSum_uint32_tv 520 bytes stack frame, 380 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_PreMulSum_uint32_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_uint32_tv 208 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_uint32_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_PreMulSum_uint32_tv 520 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_PreMulSum_uint32_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_uint32_tv 208 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_uint32_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_PreMulSum_uint32_tv 520 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_PreMulSum_uint32_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_uint32_tv 224 bytes stack frame, 220 bytes spill stores, 220 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_uint32_tv 312 bytes stack frame, 344 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_PreMulSum_uint32_tv 560 bytes stack frame, 428 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_PreMulSum_uint32_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_uint32_tv 312 bytes stack frame, 328 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_uint32_tv 408 bytes stack frame, 456 bytes spill stores, 840 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_PreMulSum_uint32_tv 560 bytes stack frame, 432 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_PreMulSum_uint32_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_uint32_tv 256 bytes stack frame, 268 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_uint32_tv 312 bytes stack frame, 344 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_PreMulSum_uint32_tv 520 bytes stack frame, 384 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_PreMulSum_uint32_tv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_PreMulSum_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_f64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=8 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_f64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_f64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_doublev 216 bytes stack frame, 212 bytes spill stores, 212 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_doublev 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_RING_LL128_PreMulSum_doublev 504 bytes stack frame, 352 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_LL_PreMulSum_doublev 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_doublev 216 bytes stack frame, 212 bytes spill stores, 212 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_doublev 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_RING_LL128_PreMulSum_doublev 504 bytes stack frame, 352 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_LL_PreMulSum_doublev 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_doublev 216 bytes stack frame, 212 bytes spill stores, 212 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_doublev 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_RING_LL128_PreMulSum_doublev 504 bytes stack frame, 352 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_LL_PreMulSum_doublev 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_doublev 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_doublev 296 bytes stack frame, 324 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_RING_LL128_PreMulSum_doublev 560 bytes stack frame, 416 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_LL_PreMulSum_doublev 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_doublev 304 bytes stack frame, 320 bytes spill stores, 332 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_doublev 384 bytes stack frame, 424 bytes spill stores, 644 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_RING_LL128_PreMulSum_doublev 576 bytes stack frame, 448 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_LL_PreMulSum_doublev 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_doublev 248 bytes stack frame, 260 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_doublev 280 bytes stack frame, 300 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_RING_LL128_PreMulSum_doublev 528 bytes stack frame, 392 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_LL_PreMulSum_doublev 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_LL_PreMulSum_doublev 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_halfv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_halfv 512 bytes stack frame, 1080 bytes spill stores, 1292 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_halfv 240 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL128_PreMulSum_halfv 488 bytes stack frame, 352 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL_PreMulSum_halfv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_halfv 400 bytes stack frame, 648 bytes spill stores, 1040 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL128_PreMulSum_halfv 592 bytes stack frame, 496 bytes spill stores, 664 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL_PreMulSum_halfv 344 bytes stack frame, 568 bytes spill stores, 728 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_halfv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_halfv 464 bytes stack frame, 956 bytes spill stores, 1224 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_halfv 224 bytes stack frame, 244 bytes spill stores, 244 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL128_PreMulSum_halfv 512 bytes stack frame, 376 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL_PreMulSum_halfv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_halfv 328 bytes stack frame, 468 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL128_PreMulSum_halfv 504 bytes stack frame, 356 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL_PreMulSum_halfv 328 bytes stack frame, 544 bytes spill stores, 676 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_halfv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_halfv 464 bytes stack frame, 948 bytes spill stores, 1160 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_halfv 240 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL128_PreMulSum_halfv 488 bytes stack frame, 352 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL_PreMulSum_halfv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_halfv 400 bytes stack frame, 648 bytes spill stores, 1040 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL128_PreMulSum_halfv 592 bytes stack frame, 468 bytes spill stores, 644 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL_PreMulSum_halfv 344 bytes stack frame, 568 bytes spill stores, 728 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_halfv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_halfv 496 bytes stack frame, 1000 bytes spill stores, 1252 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_halfv 288 bytes stack frame, 312 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL128_PreMulSum_halfv 568 bytes stack frame, 436 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL_PreMulSum_halfv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_halfv 328 bytes stack frame, 476 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL128_PreMulSum_halfv 568 bytes stack frame, 408 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL_PreMulSum_halfv 352 bytes stack frame, 564 bytes spill stores, 768 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_halfv 256 bytes stack frame, 284 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_halfv 640 bytes stack frame, 1392 bytes spill stores, 2124 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_halfv 384 bytes stack frame, 448 bytes spill stores, 1064 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL128_PreMulSum_halfv 568 bytes stack frame, 440 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL_PreMulSum_halfv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_halfv 520 bytes stack frame, 1064 bytes spill stores, 1632 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL128_PreMulSum_halfv 560 bytes stack frame, 408 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL_PreMulSum_halfv 368 bytes stack frame, 576 bytes spill stores, 792 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_halfv 240 bytes stack frame, 260 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_halfv 600 bytes stack frame, 1344 bytes spill stores, 2156 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_halfv 312 bytes stack frame, 360 bytes spill stores, 652 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL128_PreMulSum_halfv 512 bytes stack frame, 368 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL_PreMulSum_halfv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_halfv 448 bytes stack frame, 1044 bytes spill stores, 1592 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL128_PreMulSum_halfv 488 bytes stack frame, 304 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL_PreMulSum_halfv 352 bytes stack frame, 540 bytes spill stores, 776 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z44ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z44ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclKernel_AllReduce_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z48ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z49ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclKernel_AllReduce_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 344 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z39ncclKernel_AllReduce_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 168 bytes stack frame, 348 bytes spill stores, 556 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_int8_tv 216 bytes stack frame, 236 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_int8_tv 504 bytes stack frame, 1184 bytes spill stores, 1760 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Sum_int8_tv 288 bytes stack frame, 312 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Sum_int8_tv 464 bytes stack frame, 312 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Sum_int8_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Sum_int8_tv 464 bytes stack frame, 976 bytes spill stores, 1440 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Sum_int8_tv 496 bytes stack frame, 332 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Sum_int8_tv 344 bytes stack frame, 580 bytes spill stores, 732 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z44ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z44ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclKernel_AllReduce_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z48ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z49ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclKernel_AllReduce_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 344 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z39ncclKernel_AllReduce_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 168 bytes stack frame, 348 bytes spill stores, 556 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_int8_tv 216 bytes stack frame, 236 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_int8_tv 504 bytes stack frame, 1184 bytes spill stores, 1760 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Sum_int8_tv 288 bytes stack frame, 312 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Sum_int8_tv 464 bytes stack frame, 312 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Sum_int8_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Sum_int8_tv 464 bytes stack frame, 976 bytes spill stores, 1440 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Sum_int8_tv 496 bytes stack frame, 332 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Sum_int8_tv 344 bytes stack frame, 580 bytes spill stores, 732 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z44ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z44ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclKernel_AllReduce_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z48ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z49ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclKernel_AllReduce_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 344 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z39ncclKernel_AllReduce_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 168 bytes stack frame, 348 bytes spill stores, 556 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_int8_tv 216 bytes stack frame, 236 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_int8_tv 504 bytes stack frame, 1184 bytes spill stores, 1760 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Sum_int8_tv 288 bytes stack frame, 312 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Sum_int8_tv 464 bytes stack frame, 312 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Sum_int8_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Sum_int8_tv 464 bytes stack frame, 976 bytes spill stores, 1440 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Sum_int8_tv 496 bytes stack frame, 332 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Sum_int8_tv 344 bytes stack frame, 580 bytes spill stores, 732 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z44ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z44ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclKernel_AllReduce_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z48ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z49ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclKernel_AllReduce_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z39ncclKernel_AllReduce_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 176 bytes stack frame, 340 bytes spill stores, 576 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_int8_tv 232 bytes stack frame, 252 bytes spill stores, 244 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_int8_tv 544 bytes stack frame, 1176 bytes spill stores, 1668 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Sum_int8_tv 336 bytes stack frame, 384 bytes spill stores, 632 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Sum_int8_tv 520 bytes stack frame, 372 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Sum_int8_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Sum_int8_tv 560 bytes stack frame, 1140 bytes spill stores, 1588 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Sum_int8_tv 560 bytes stack frame, 408 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Sum_int8_tv 368 bytes stack frame, 576 bytes spill stores, 816 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z44ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z44ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclKernel_AllReduce_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z48ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z48ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z49ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclKernel_AllReduce_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z39ncclKernel_AllReduce_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 176 bytes stack frame, 340 bytes spill stores, 580 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_int8_tv 376 bytes stack frame, 640 bytes spill stores, 1324 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_int8_tv 680 bytes stack frame, 1980 bytes spill stores, 3516 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Sum_int8_tv 528 bytes stack frame, 856 bytes spill stores, 1832 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Sum_int8_tv 520 bytes stack frame, 360 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Sum_int8_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Sum_int8_tv 936 bytes stack frame, 1908 bytes spill stores, 2608 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Sum_int8_tv 560 bytes stack frame, 408 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Sum_int8_tv 384 bytes stack frame, 592 bytes spill stores, 840 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z44ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z44ncclKernel_AllReduce_NVLS_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclKernel_AllReduce_NVLS_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z48ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z48ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z49ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z49ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclKernel_AllReduce_RING_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_Z39ncclKernel_AllReduce_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z39ncclKernel_AllReduce_TREE_LL_Sum_int8_tP11ncclDevCommmP8ncclWork 160 bytes stack frame, 304 bytes spill stores, 528 bytes spill loads ptxas info : Used 96 registers ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_int8_tv 360 bytes stack frame, 640 bytes spill stores, 1476 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_int8_tv 672 bytes stack frame, 2020 bytes spill stores, 3800 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Sum_int8_tv 400 bytes stack frame, 640 bytes spill stores, 1352 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Sum_int8_tv 464 bytes stack frame, 312 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Sum_int8_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Sum_int8_tv 880 bytes stack frame, 1860 bytes spill stores, 2548 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Sum_int8_tv 456 bytes stack frame, 256 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Sum_int8_tv 368 bytes stack frame, 552 bytes spill stores, 860 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_int32_tv 208 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_int32_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_PreMulSum_int32_tv 520 bytes stack frame, 380 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_PreMulSum_int32_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_int32_tv 208 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_int32_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_PreMulSum_int32_tv 520 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_PreMulSum_int32_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_int32_tv 208 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_int32_tv 232 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_PreMulSum_int32_tv 520 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_PreMulSum_int32_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_int32_tv 224 bytes stack frame, 220 bytes spill stores, 220 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_int32_tv 312 bytes stack frame, 344 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_PreMulSum_int32_tv 560 bytes stack frame, 428 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_PreMulSum_int32_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_int32_tv 312 bytes stack frame, 328 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_int32_tv 408 bytes stack frame, 456 bytes spill stores, 840 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_PreMulSum_int32_tv 560 bytes stack frame, 432 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_PreMulSum_int32_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_int32_tv 256 bytes stack frame, 268 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_int32_tv 312 bytes stack frame, 344 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_PreMulSum_int32_tv 520 bytes stack frame, 384 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_PreMulSum_int32_tv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_PreMulSum_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_int8_tv 216 bytes stack frame, 236 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_int8_tv 512 bytes stack frame, 1156 bytes spill stores, 1648 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Prod_int8_tv 288 bytes stack frame, 312 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Prod_int8_tv 464 bytes stack frame, 312 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Prod_int8_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Prod_int8_tv 488 bytes stack frame, 904 bytes spill stores, 1320 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Prod_int8_tv 816 bytes stack frame, 416 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Prod_int8_tv 344 bytes stack frame, 580 bytes spill stores, 744 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_int8_tv 216 bytes stack frame, 236 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_int8_tv 512 bytes stack frame, 1156 bytes spill stores, 1648 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Prod_int8_tv 288 bytes stack frame, 312 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Prod_int8_tv 464 bytes stack frame, 312 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Prod_int8_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Prod_int8_tv 488 bytes stack frame, 904 bytes spill stores, 1320 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Prod_int8_tv 816 bytes stack frame, 416 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Prod_int8_tv 344 bytes stack frame, 580 bytes spill stores, 744 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_int8_tv 216 bytes stack frame, 236 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_int8_tv 512 bytes stack frame, 1156 bytes spill stores, 1648 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Prod_int8_tv 288 bytes stack frame, 312 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Prod_int8_tv 464 bytes stack frame, 312 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Prod_int8_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Prod_int8_tv 488 bytes stack frame, 904 bytes spill stores, 1320 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Prod_int8_tv 816 bytes stack frame, 416 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Prod_int8_tv 344 bytes stack frame, 580 bytes spill stores, 744 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_int8_tv 232 bytes stack frame, 252 bytes spill stores, 244 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_int8_tv 552 bytes stack frame, 1180 bytes spill stores, 1692 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Prod_int8_tv 336 bytes stack frame, 384 bytes spill stores, 632 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Prod_int8_tv 544 bytes stack frame, 416 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Prod_int8_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Prod_int8_tv 536 bytes stack frame, 972 bytes spill stores, 1452 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Prod_int8_tv 856 bytes stack frame, 476 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Prod_int8_tv 376 bytes stack frame, 588 bytes spill stores, 848 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_int8_tv 376 bytes stack frame, 640 bytes spill stores, 1324 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_int8_tv 688 bytes stack frame, 1840 bytes spill stores, 3188 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Prod_int8_tv 536 bytes stack frame, 852 bytes spill stores, 1828 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Prod_int8_tv 544 bytes stack frame, 396 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Prod_int8_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Prod_int8_tv 1056 bytes stack frame, 2552 bytes spill stores, 3260 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Prod_int8_tv 856 bytes stack frame, 480 bytes spill stores, 664 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Prod_int8_tv 384 bytes stack frame, 592 bytes spill stores, 840 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod_int8_tv 360 bytes stack frame, 632 bytes spill stores, 1468 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod_int8_tv 672 bytes stack frame, 2024 bytes spill stores, 3816 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Prod_int8_tv 416 bytes stack frame, 644 bytes spill stores, 1348 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Prod_int8_tv 496 bytes stack frame, 344 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Prod_int8_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Prod_int8_tv 1032 bytes stack frame, 2644 bytes spill stores, 3372 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Prod_int8_tv 792 bytes stack frame, 384 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Prod_int8_tv 368 bytes stack frame, 560 bytes spill stores, 868 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_f32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=7 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_f32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_f32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_floatv 208 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_floatv 240 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL128_PreMulSum_floatv 520 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL_PreMulSum_floatv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_floatv 208 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_floatv 240 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL128_PreMulSum_floatv 520 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL_PreMulSum_floatv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_floatv 208 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_floatv 240 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL128_PreMulSum_floatv 520 bytes stack frame, 388 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL_PreMulSum_floatv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_floatv 224 bytes stack frame, 220 bytes spill stores, 220 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_floatv 312 bytes stack frame, 344 bytes spill stores, 384 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL128_PreMulSum_floatv 560 bytes stack frame, 428 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL_PreMulSum_floatv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_floatv 304 bytes stack frame, 316 bytes spill stores, 316 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_floatv 408 bytes stack frame, 456 bytes spill stores, 840 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL128_PreMulSum_floatv 560 bytes stack frame, 432 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL_PreMulSum_floatv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_floatv 248 bytes stack frame, 260 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_floatv 312 bytes stack frame, 344 bytes spill stores, 472 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL128_PreMulSum_floatv 520 bytes stack frame, 384 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_RING_LL_PreMulSum_floatv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_ReduceScatter_TREE_LL_PreMulSum_floatv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z40ncclKernel_AllReduce_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z40ncclKernel_AllReduce_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 344 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z40ncclKernel_AllReduce_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 168 bytes stack frame, 348 bytes spill stores, 556 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_uint8_tv 216 bytes stack frame, 236 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_uint8_tv 504 bytes stack frame, 1184 bytes spill stores, 1760 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Sum_uint8_tv 288 bytes stack frame, 312 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Sum_uint8_tv 464 bytes stack frame, 312 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Sum_uint8_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Sum_uint8_tv 464 bytes stack frame, 976 bytes spill stores, 1440 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Sum_uint8_tv 496 bytes stack frame, 332 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Sum_uint8_tv 344 bytes stack frame, 580 bytes spill stores, 732 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z40ncclKernel_AllReduce_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z40ncclKernel_AllReduce_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 344 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z40ncclKernel_AllReduce_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 168 bytes stack frame, 348 bytes spill stores, 556 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_uint8_tv 216 bytes stack frame, 236 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_uint8_tv 504 bytes stack frame, 1184 bytes spill stores, 1760 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Sum_uint8_tv 288 bytes stack frame, 312 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Sum_uint8_tv 464 bytes stack frame, 312 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Sum_uint8_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Sum_uint8_tv 464 bytes stack frame, 976 bytes spill stores, 1440 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Sum_uint8_tv 496 bytes stack frame, 332 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Sum_uint8_tv 344 bytes stack frame, 580 bytes spill stores, 732 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z40ncclKernel_AllReduce_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z40ncclKernel_AllReduce_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 94 registers, 344 bytes cmem[0], 8 bytes cmem[2] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z40ncclKernel_AllReduce_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 168 bytes stack frame, 348 bytes spill stores, 556 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 8 bytes cmem[2] ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_uint8_tv 216 bytes stack frame, 236 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_uint8_tv 504 bytes stack frame, 1184 bytes spill stores, 1760 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Sum_uint8_tv 288 bytes stack frame, 312 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Sum_uint8_tv 464 bytes stack frame, 312 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Sum_uint8_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Sum_uint8_tv 464 bytes stack frame, 976 bytes spill stores, 1440 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Sum_uint8_tv 496 bytes stack frame, 332 bytes spill stores, 292 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Sum_uint8_tv 344 bytes stack frame, 580 bytes spill stores, 732 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z40ncclKernel_AllReduce_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z40ncclKernel_AllReduce_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z40ncclKernel_AllReduce_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 176 bytes stack frame, 340 bytes spill stores, 576 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_uint8_tv 232 bytes stack frame, 252 bytes spill stores, 244 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_uint8_tv 544 bytes stack frame, 1176 bytes spill stores, 1668 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Sum_uint8_tv 336 bytes stack frame, 384 bytes spill stores, 632 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Sum_uint8_tv 520 bytes stack frame, 372 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Sum_uint8_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Sum_uint8_tv 560 bytes stack frame, 1140 bytes spill stores, 1588 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Sum_uint8_tv 560 bytes stack frame, 408 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Sum_uint8_tv 368 bytes stack frame, 576 bytes spill stores, 816 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z40ncclKernel_AllReduce_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z40ncclKernel_AllReduce_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z40ncclKernel_AllReduce_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 176 bytes stack frame, 340 bytes spill stores, 580 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_uint8_tv 376 bytes stack frame, 640 bytes spill stores, 1324 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_uint8_tv 680 bytes stack frame, 1980 bytes spill stores, 3516 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Sum_uint8_tv 528 bytes stack frame, 856 bytes spill stores, 1832 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Sum_uint8_tv 520 bytes stack frame, 360 bytes spill stores, 476 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Sum_uint8_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Sum_uint8_tv 936 bytes stack frame, 1908 bytes spill stores, 2608 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Sum_uint8_tv 560 bytes stack frame, 408 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Sum_uint8_tv 384 bytes stack frame, 592 bytes spill stores, 840 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z45ncclKernel_AllReduce_NVLS_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z40ncclKernel_AllReduce_NVLS_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z49ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z50ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z40ncclKernel_AllReduce_RING_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_Z40ncclKernel_AllReduce_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z40ncclKernel_AllReduce_TREE_LL_Sum_uint8_tP11ncclDevCommmP8ncclWork 160 bytes stack frame, 304 bytes spill stores, 528 bytes spill loads ptxas info : Used 96 registers ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_uint8_tv 360 bytes stack frame, 640 bytes spill stores, 1476 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_uint8_tv 672 bytes stack frame, 2020 bytes spill stores, 3800 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Sum_uint8_tv 400 bytes stack frame, 640 bytes spill stores, 1352 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Sum_uint8_tv 464 bytes stack frame, 312 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Sum_uint8_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Sum_uint8_tv 880 bytes stack frame, 1860 bytes spill stores, 2548 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Sum_uint8_tv 456 bytes stack frame, 256 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Sum_uint8_tv 368 bytes stack frame, 552 bytes spill stores, 860 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_int64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_int64_tv 280 bytes stack frame, 308 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_int64_tv 520 bytes stack frame, 388 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_SumPostDiv_int64_tv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_int64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_int64_tv 280 bytes stack frame, 308 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_int64_tv 520 bytes stack frame, 388 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_SumPostDiv_int64_tv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_int64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_int64_tv 280 bytes stack frame, 308 bytes spill stores, 340 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_int64_tv 520 bytes stack frame, 388 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_SumPostDiv_int64_tv 136 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_int64_tv 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_int64_tv 352 bytes stack frame, 396 bytes spill stores, 480 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_int64_tv 560 bytes stack frame, 436 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_SumPostDiv_int64_tv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_int64_tv 336 bytes stack frame, 348 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_int64_tv 392 bytes stack frame, 448 bytes spill stores, 840 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_int64_tv 576 bytes stack frame, 456 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_SumPostDiv_int64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_int64_tv 288 bytes stack frame, 300 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_int64_tv 344 bytes stack frame, 372 bytes spill stores, 572 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_int64_tv 536 bytes stack frame, 416 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_SumPostDiv_int64_tv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z42ncclKernel_AllReduce_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z42ncclKernel_AllReduce_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_AllReduce_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z37ncclKernel_AllReduce_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z46ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z47ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_AllReduce_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z37ncclKernel_AllReduce_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z37ncclKernel_AllReduce_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z37ncclKernel_AllReduce_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 160 bytes stack frame, 344 bytes spill stores, 552 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_TREE_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_NVLS_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_halfv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_halfv 480 bytes stack frame, 964 bytes spill stores, 1268 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_SIMPLE_Sum_halfv 272 bytes stack frame, 296 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL128_Sum_halfv 496 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_RING_LL_Sum_halfv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_SIMPLE_Sum_halfv 400 bytes stack frame, 636 bytes spill stores, 1000 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL128_Sum_halfv 584 bytes stack frame, 468 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_TREE_LL_Sum_halfv 336 bytes stack frame, 564 bytes spill stores, 716 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z42ncclKernel_AllReduce_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z42ncclKernel_AllReduce_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_AllReduce_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z37ncclKernel_AllReduce_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z46ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z47ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_AllReduce_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z37ncclKernel_AllReduce_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z37ncclKernel_AllReduce_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z37ncclKernel_AllReduce_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 144 bytes stack frame, 308 bytes spill stores, 492 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_TREE_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_NVLS_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_halfv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_halfv 488 bytes stack frame, 1008 bytes spill stores, 1252 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_SIMPLE_Sum_halfv 224 bytes stack frame, 240 bytes spill stores, 232 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL128_Sum_halfv 536 bytes stack frame, 396 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_RING_LL_Sum_halfv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_SIMPLE_Sum_halfv 296 bytes stack frame, 404 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL128_Sum_halfv 504 bytes stack frame, 356 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_TREE_LL_Sum_halfv 328 bytes stack frame, 536 bytes spill stores, 656 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z42ncclKernel_AllReduce_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z42ncclKernel_AllReduce_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_AllReduce_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z37ncclKernel_AllReduce_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z46ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z47ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_AllReduce_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z37ncclKernel_AllReduce_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Compiling entry function '_Z37ncclKernel_AllReduce_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z37ncclKernel_AllReduce_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 160 bytes stack frame, 344 bytes spill stores, 552 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 4 bytes cmem[2] ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_TREE_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_NVLS_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_halfv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_halfv 488 bytes stack frame, 976 bytes spill stores, 1296 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_SIMPLE_Sum_halfv 272 bytes stack frame, 296 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL128_Sum_halfv 496 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_RING_LL_Sum_halfv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_SIMPLE_Sum_halfv 400 bytes stack frame, 636 bytes spill stores, 1000 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL128_Sum_halfv 584 bytes stack frame, 468 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_TREE_LL_Sum_halfv 336 bytes stack frame, 564 bytes spill stores, 716 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z42ncclKernel_AllReduce_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z42ncclKernel_AllReduce_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_AllReduce_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z37ncclKernel_AllReduce_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z46ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z47ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_AllReduce_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z37ncclKernel_AllReduce_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_AllReduce_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z37ncclKernel_AllReduce_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 160 bytes stack frame, 312 bytes spill stores, 548 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_TREE_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_NVLS_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_halfv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_halfv 528 bytes stack frame, 1036 bytes spill stores, 1416 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_SIMPLE_Sum_halfv 280 bytes stack frame, 308 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL128_Sum_halfv 576 bytes stack frame, 484 bytes spill stores, 620 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_RING_LL_Sum_halfv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_SIMPLE_Sum_halfv 312 bytes stack frame, 456 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL128_Sum_halfv 560 bytes stack frame, 408 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_TREE_LL_Sum_halfv 352 bytes stack frame, 560 bytes spill stores, 752 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z42ncclKernel_AllReduce_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z42ncclKernel_AllReduce_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_AllReduce_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z37ncclKernel_AllReduce_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z46ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z47ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z47ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_AllReduce_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z37ncclKernel_AllReduce_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z37ncclKernel_AllReduce_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z37ncclKernel_AllReduce_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 160 bytes stack frame, 328 bytes spill stores, 564 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_TREE_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_SIMPLE_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_NVLS_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_halfv 256 bytes stack frame, 276 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_halfv 576 bytes stack frame, 1284 bytes spill stores, 2044 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_SIMPLE_Sum_halfv 400 bytes stack frame, 476 bytes spill stores, 1012 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL128_Sum_halfv 560 bytes stack frame, 416 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_RING_LL_Sum_halfv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_SIMPLE_Sum_halfv 504 bytes stack frame, 1008 bytes spill stores, 1580 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL128_Sum_halfv 560 bytes stack frame, 408 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_TREE_LL_Sum_halfv 360 bytes stack frame, 572 bytes spill stores, 788 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z42ncclKernel_AllReduce_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z42ncclKernel_AllReduce_NVLS_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z37ncclKernel_AllReduce_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z37ncclKernel_AllReduce_NVLS_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z46ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z47ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z47ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z37ncclKernel_AllReduce_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z37ncclKernel_AllReduce_RING_LL_Sum_halfP11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_Z37ncclKernel_AllReduce_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z37ncclKernel_AllReduce_TREE_LL_Sum_halfP11ncclDevCommmP8ncclWork 136 bytes stack frame, 276 bytes spill stores, 488 bytes spill loads ptxas info : Used 96 registers ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum_halfv 520 bytes stack frame, 952 bytes spill stores, 1388 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_TREE_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_NVLS_SIMPLE_Sum_halfv 416 bytes stack frame, 716 bytes spill stores, 1100 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_NVLS_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_halfv 240 bytes stack frame, 264 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_halfv 568 bytes stack frame, 1320 bytes spill stores, 2000 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_RING_SIMPLE_Sum_halfv 320 bytes stack frame, 352 bytes spill stores, 612 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL128_Sum_halfv 504 bytes stack frame, 360 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_RING_LL_Sum_halfv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z43ncclFunction_AllReduce_TREE_SIMPLE_Sum_halfv 472 bytes stack frame, 888 bytes spill stores, 1296 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL128_Sum_halfv 480 bytes stack frame, 288 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z39ncclFunction_AllReduce_TREE_LL_Sum_halfv 352 bytes stack frame, 540 bytes spill stores, 748 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_Max___nv_bfloat16v 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_Max___nv_bfloat16v 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_Max___nv_bfloat16v 520 bytes stack frame, 396 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_Max___nv_bfloat16v 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_Max___nv_bfloat16v 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_Max___nv_bfloat16v 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_Max___nv_bfloat16v 520 bytes stack frame, 396 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_Max___nv_bfloat16v 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_Max___nv_bfloat16v 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_Max___nv_bfloat16v 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_Max___nv_bfloat16v 520 bytes stack frame, 396 bytes spill stores, 460 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_Max___nv_bfloat16v 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_Max___nv_bfloat16v 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_Max___nv_bfloat16v 264 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_Max___nv_bfloat16v 560 bytes stack frame, 428 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_Max___nv_bfloat16v 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_Max___nv_bfloat16v 336 bytes stack frame, 348 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_Max___nv_bfloat16v 384 bytes stack frame, 440 bytes spill stores, 660 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_Max___nv_bfloat16v 568 bytes stack frame, 432 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_Max___nv_bfloat16v 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_Max___nv_bfloat16v 272 bytes stack frame, 288 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_Max___nv_bfloat16v 280 bytes stack frame, 304 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_Max___nv_bfloat16v 536 bytes stack frame, 400 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_Max___nv_bfloat16v 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_i32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=2 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_i32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_i32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_int32_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_int32_tv 248 bytes stack frame, 272 bytes spill stores, 268 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_int32_tv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_SumPostDiv_int32_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_int32_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_int32_tv 248 bytes stack frame, 272 bytes spill stores, 268 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_int32_tv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_SumPostDiv_int32_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_int32_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_int32_tv 248 bytes stack frame, 272 bytes spill stores, 268 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_int32_tv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_SumPostDiv_int32_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_int32_tv 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_int32_tv 312 bytes stack frame, 340 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_int32_tv 552 bytes stack frame, 420 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_SumPostDiv_int32_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_int32_tv 336 bytes stack frame, 348 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_int32_tv 416 bytes stack frame, 476 bytes spill stores, 844 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_int32_tv 560 bytes stack frame, 432 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_SumPostDiv_int32_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_int32_tv 288 bytes stack frame, 300 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_int32_tv 320 bytes stack frame, 364 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_int32_tv 528 bytes stack frame, 400 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_SumPostDiv_int32_tv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_int32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_u32.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=3 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_u32.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_u32.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_uint32_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z68ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_uint32_tv 240 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_uint32_tv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_RING_LL_SumPostDiv_uint32_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_uint32_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z68ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_uint32_tv 240 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_uint32_tv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_RING_LL_SumPostDiv_uint32_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_uint32_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z68ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_uint32_tv 240 bytes stack frame, 260 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_uint32_tv 504 bytes stack frame, 364 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_RING_LL_SumPostDiv_uint32_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_uint32_tv 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z68ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_uint32_tv 304 bytes stack frame, 332 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_uint32_tv 552 bytes stack frame, 420 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_RING_LL_SumPostDiv_uint32_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_uint32_tv 336 bytes stack frame, 348 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z68ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_uint32_tv 408 bytes stack frame, 464 bytes spill stores, 832 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_uint32_tv 560 bytes stack frame, 432 bytes spill stores, 528 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_RING_LL_SumPostDiv_uint32_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_uint32_tv 288 bytes stack frame, 300 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z68ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_uint32_tv 320 bytes stack frame, 360 bytes spill stores, 480 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_uint32_tv 528 bytes stack frame, 400 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_RING_LL_SumPostDiv_uint32_tv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_uint32_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_uint64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z68ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_uint64_tv 264 bytes stack frame, 292 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_uint64_tv 520 bytes stack frame, 388 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_RING_LL_SumPostDiv_uint64_tv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_uint64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z68ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_uint64_tv 264 bytes stack frame, 292 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_uint64_tv 520 bytes stack frame, 388 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_RING_LL_SumPostDiv_uint64_tv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_uint64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z68ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_uint64_tv 264 bytes stack frame, 292 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_uint64_tv 520 bytes stack frame, 388 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_RING_LL_SumPostDiv_uint64_tv 128 bytes stack frame, 128 bytes spill stores, 128 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_uint64_tv 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z68ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_uint64_tv 344 bytes stack frame, 384 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_uint64_tv 560 bytes stack frame, 436 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_RING_LL_SumPostDiv_uint64_tv 168 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_uint64_tv 336 bytes stack frame, 348 bytes spill stores, 360 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z68ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_uint64_tv 384 bytes stack frame, 424 bytes spill stores, 816 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_uint64_tv 576 bytes stack frame, 456 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_RING_LL_SumPostDiv_uint64_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_uint64_tv 288 bytes stack frame, 300 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z68ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_uint64_tv 296 bytes stack frame, 324 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_uint64_tv 536 bytes stack frame, 416 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_RING_LL_SumPostDiv_uint64_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_f16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=6 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_f16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_f16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_halfv 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_halfv 232 bytes stack frame, 252 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL128_PreMulSum_halfv 504 bytes stack frame, 364 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL_PreMulSum_halfv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_halfv 216 bytes stack frame, 220 bytes spill stores, 220 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_halfv 224 bytes stack frame, 240 bytes spill stores, 232 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL128_PreMulSum_halfv 504 bytes stack frame, 364 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL_PreMulSum_halfv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_halfv 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_halfv 232 bytes stack frame, 252 bytes spill stores, 248 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL128_PreMulSum_halfv 504 bytes stack frame, 364 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL_PreMulSum_halfv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_halfv 240 bytes stack frame, 240 bytes spill stores, 240 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_halfv 296 bytes stack frame, 320 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL128_PreMulSum_halfv 568 bytes stack frame, 436 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL_PreMulSum_halfv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_halfv 288 bytes stack frame, 308 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_halfv 408 bytes stack frame, 448 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL128_PreMulSum_halfv 568 bytes stack frame, 440 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL_PreMulSum_halfv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_halfv 240 bytes stack frame, 256 bytes spill stores, 244 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_halfv 296 bytes stack frame, 320 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL128_PreMulSum_halfv 520 bytes stack frame, 392 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_RING_LL_PreMulSum_halfv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_ReduceScatter_TREE_LL_PreMulSum_halfv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_i64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=4 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_i64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_i64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_int64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_int64_tv 208 bytes stack frame, 224 bytes spill stores, 216 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_PreMulSum_int64_tv 512 bytes stack frame, 352 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_PreMulSum_int64_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_int64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_int64_tv 208 bytes stack frame, 224 bytes spill stores, 216 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_PreMulSum_int64_tv 512 bytes stack frame, 352 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_PreMulSum_int64_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_int64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_int64_tv 208 bytes stack frame, 224 bytes spill stores, 216 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_PreMulSum_int64_tv 512 bytes stack frame, 352 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_PreMulSum_int64_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_int64_tv 256 bytes stack frame, 272 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_int64_tv 312 bytes stack frame, 340 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_PreMulSum_int64_tv 560 bytes stack frame, 424 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_PreMulSum_int64_tv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_int64_tv 304 bytes stack frame, 320 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_int64_tv 432 bytes stack frame, 476 bytes spill stores, 924 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_PreMulSum_int64_tv 568 bytes stack frame, 432 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_PreMulSum_int64_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_int64_tv 248 bytes stack frame, 264 bytes spill stores, 252 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_int64_tv 336 bytes stack frame, 376 bytes spill stores, 632 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_PreMulSum_int64_tv 536 bytes stack frame, 400 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_PreMulSum_int64_tv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_PreMulSum_int64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_uint8_tv 216 bytes stack frame, 236 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_uint8_tv 512 bytes stack frame, 1188 bytes spill stores, 1780 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Max_uint8_tv 288 bytes stack frame, 320 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Max_uint8_tv 480 bytes stack frame, 320 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Max_uint8_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Max_uint8_tv 472 bytes stack frame, 892 bytes spill stores, 1272 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Max_uint8_tv 848 bytes stack frame, 444 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Max_uint8_tv 344 bytes stack frame, 580 bytes spill stores, 744 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_uint8_tv 216 bytes stack frame, 236 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_uint8_tv 512 bytes stack frame, 1188 bytes spill stores, 1780 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Max_uint8_tv 288 bytes stack frame, 320 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Max_uint8_tv 480 bytes stack frame, 320 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Max_uint8_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Max_uint8_tv 472 bytes stack frame, 892 bytes spill stores, 1272 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Max_uint8_tv 848 bytes stack frame, 444 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Max_uint8_tv 344 bytes stack frame, 580 bytes spill stores, 744 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_uint8_tv 216 bytes stack frame, 236 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_uint8_tv 512 bytes stack frame, 1188 bytes spill stores, 1780 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Max_uint8_tv 288 bytes stack frame, 320 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Max_uint8_tv 480 bytes stack frame, 320 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Max_uint8_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Max_uint8_tv 472 bytes stack frame, 892 bytes spill stores, 1272 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Max_uint8_tv 848 bytes stack frame, 444 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Max_uint8_tv 344 bytes stack frame, 580 bytes spill stores, 744 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_uint8_tv 288 bytes stack frame, 312 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_uint8_tv 544 bytes stack frame, 1160 bytes spill stores, 1636 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Max_uint8_tv 384 bytes stack frame, 420 bytes spill stores, 988 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Max_uint8_tv 544 bytes stack frame, 416 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Max_uint8_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Max_uint8_tv 512 bytes stack frame, 988 bytes spill stores, 1392 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Max_uint8_tv 880 bytes stack frame, 508 bytes spill stores, 712 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Max_uint8_tv 376 bytes stack frame, 588 bytes spill stores, 848 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_uint8_tv 376 bytes stack frame, 640 bytes spill stores, 1324 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_uint8_tv 680 bytes stack frame, 1976 bytes spill stores, 3500 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Max_uint8_tv 520 bytes stack frame, 788 bytes spill stores, 1732 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Max_uint8_tv 544 bytes stack frame, 396 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Max_uint8_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Max_uint8_tv 960 bytes stack frame, 2236 bytes spill stores, 3004 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Max_uint8_tv 848 bytes stack frame, 468 bytes spill stores, 652 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Max_uint8_tv 384 bytes stack frame, 592 bytes spill stores, 840 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_uint8_tv 360 bytes stack frame, 632 bytes spill stores, 1468 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_uint8_tv 664 bytes stack frame, 2284 bytes spill stores, 4252 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Max_uint8_tv 432 bytes stack frame, 648 bytes spill stores, 1340 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Max_uint8_tv 480 bytes stack frame, 340 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Max_uint8_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Max_uint8_tv 904 bytes stack frame, 2296 bytes spill stores, 3012 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Max_uint8_tv 784 bytes stack frame, 376 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Max_uint8_tv 368 bytes stack frame, 560 bytes spill stores, 868 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_uint8_tv 216 bytes stack frame, 236 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_uint8_tv 512 bytes stack frame, 1188 bytes spill stores, 1780 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Min_uint8_tv 288 bytes stack frame, 320 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Min_uint8_tv 480 bytes stack frame, 320 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Min_uint8_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Min_uint8_tv 472 bytes stack frame, 892 bytes spill stores, 1272 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Min_uint8_tv 848 bytes stack frame, 444 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Min_uint8_tv 344 bytes stack frame, 580 bytes spill stores, 744 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_uint8_tv 216 bytes stack frame, 236 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_uint8_tv 512 bytes stack frame, 1188 bytes spill stores, 1780 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Min_uint8_tv 288 bytes stack frame, 320 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Min_uint8_tv 480 bytes stack frame, 320 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Min_uint8_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Min_uint8_tv 472 bytes stack frame, 892 bytes spill stores, 1272 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Min_uint8_tv 848 bytes stack frame, 444 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Min_uint8_tv 344 bytes stack frame, 580 bytes spill stores, 744 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_uint8_tv 216 bytes stack frame, 236 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_uint8_tv 512 bytes stack frame, 1188 bytes spill stores, 1780 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Min_uint8_tv 288 bytes stack frame, 320 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Min_uint8_tv 480 bytes stack frame, 320 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Min_uint8_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Min_uint8_tv 472 bytes stack frame, 892 bytes spill stores, 1272 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Min_uint8_tv 848 bytes stack frame, 444 bytes spill stores, 636 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Min_uint8_tv 344 bytes stack frame, 580 bytes spill stores, 744 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_uint8_tv 288 bytes stack frame, 312 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_uint8_tv 544 bytes stack frame, 1160 bytes spill stores, 1636 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Min_uint8_tv 384 bytes stack frame, 420 bytes spill stores, 988 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Min_uint8_tv 544 bytes stack frame, 416 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Min_uint8_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Min_uint8_tv 512 bytes stack frame, 988 bytes spill stores, 1392 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Min_uint8_tv 880 bytes stack frame, 508 bytes spill stores, 712 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Min_uint8_tv 376 bytes stack frame, 588 bytes spill stores, 848 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_uint8_tv 376 bytes stack frame, 640 bytes spill stores, 1324 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_uint8_tv 680 bytes stack frame, 1976 bytes spill stores, 3500 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Min_uint8_tv 520 bytes stack frame, 788 bytes spill stores, 1732 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Min_uint8_tv 544 bytes stack frame, 396 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Min_uint8_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Min_uint8_tv 960 bytes stack frame, 2236 bytes spill stores, 3004 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Min_uint8_tv 848 bytes stack frame, 468 bytes spill stores, 652 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Min_uint8_tv 384 bytes stack frame, 592 bytes spill stores, 840 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_TREE_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_SIMPLE_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_NVLS_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_uint8_tv 360 bytes stack frame, 632 bytes spill stores, 1468 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_uint8_tv 664 bytes stack frame, 2284 bytes spill stores, 4252 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_RING_SIMPLE_Min_uint8_tv 432 bytes stack frame, 648 bytes spill stores, 1340 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_LL128_Min_uint8_tv 480 bytes stack frame, 340 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_RING_LL_Min_uint8_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_TREE_SIMPLE_Min_uint8_tv 904 bytes stack frame, 2296 bytes spill stores, 3012 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_LL128_Min_uint8_tv 784 bytes stack frame, 376 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z42ncclFunction_AllReduce_TREE_LL_Min_uint8_tv 368 bytes stack frame, 560 bytes spill stores, 868 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_int8_tv 216 bytes stack frame, 236 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_int8_tv 504 bytes stack frame, 1192 bytes spill stores, 1764 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Min_int8_tv 288 bytes stack frame, 312 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Min_int8_tv 480 bytes stack frame, 320 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Min_int8_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Min_int8_tv 472 bytes stack frame, 1012 bytes spill stores, 1476 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Min_int8_tv 840 bytes stack frame, 436 bytes spill stores, 624 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Min_int8_tv 344 bytes stack frame, 580 bytes spill stores, 744 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_int8_tv 216 bytes stack frame, 236 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_int8_tv 504 bytes stack frame, 1192 bytes spill stores, 1764 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Min_int8_tv 288 bytes stack frame, 312 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Min_int8_tv 480 bytes stack frame, 320 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Min_int8_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Min_int8_tv 472 bytes stack frame, 1012 bytes spill stores, 1476 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Min_int8_tv 840 bytes stack frame, 436 bytes spill stores, 624 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Min_int8_tv 344 bytes stack frame, 580 bytes spill stores, 744 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_int8_tv 216 bytes stack frame, 236 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_int8_tv 504 bytes stack frame, 1192 bytes spill stores, 1764 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Min_int8_tv 288 bytes stack frame, 312 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Min_int8_tv 480 bytes stack frame, 320 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Min_int8_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Min_int8_tv 472 bytes stack frame, 1012 bytes spill stores, 1476 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Min_int8_tv 840 bytes stack frame, 436 bytes spill stores, 624 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Min_int8_tv 344 bytes stack frame, 580 bytes spill stores, 744 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_int8_tv 288 bytes stack frame, 312 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_int8_tv 544 bytes stack frame, 1196 bytes spill stores, 1712 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Min_int8_tv 384 bytes stack frame, 412 bytes spill stores, 972 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Min_int8_tv 544 bytes stack frame, 416 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Min_int8_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Min_int8_tv 536 bytes stack frame, 1144 bytes spill stores, 1660 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Min_int8_tv 872 bytes stack frame, 500 bytes spill stores, 704 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Min_int8_tv 376 bytes stack frame, 588 bytes spill stores, 848 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_int8_tv 376 bytes stack frame, 644 bytes spill stores, 1360 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_int8_tv 680 bytes stack frame, 1976 bytes spill stores, 3500 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Min_int8_tv 536 bytes stack frame, 804 bytes spill stores, 1744 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Min_int8_tv 544 bytes stack frame, 396 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Min_int8_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Min_int8_tv 928 bytes stack frame, 2236 bytes spill stores, 2964 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Min_int8_tv 856 bytes stack frame, 456 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Min_int8_tv 384 bytes stack frame, 592 bytes spill stores, 840 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min_int8_tv 368 bytes stack frame, 640 bytes spill stores, 1476 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min_int8_tv 672 bytes stack frame, 2188 bytes spill stores, 4068 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Min_int8_tv 424 bytes stack frame, 640 bytes spill stores, 1344 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Min_int8_tv 480 bytes stack frame, 340 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Min_int8_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Min_int8_tv 856 bytes stack frame, 2224 bytes spill stores, 2932 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Min_int8_tv 784 bytes stack frame, 380 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Min_int8_tv 368 bytes stack frame, 560 bytes spill stores, 868 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_u64.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=5 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_u64.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_u64.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_uint64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_uint64_tv 208 bytes stack frame, 224 bytes spill stores, 216 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_PreMulSum_uint64_tv 512 bytes stack frame, 352 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_PreMulSum_uint64_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_uint64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_uint64_tv 208 bytes stack frame, 224 bytes spill stores, 216 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_PreMulSum_uint64_tv 512 bytes stack frame, 352 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_PreMulSum_uint64_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_uint64_tv 224 bytes stack frame, 228 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_uint64_tv 208 bytes stack frame, 224 bytes spill stores, 216 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_PreMulSum_uint64_tv 512 bytes stack frame, 352 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_PreMulSum_uint64_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_uint64_tv 256 bytes stack frame, 272 bytes spill stores, 256 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_uint64_tv 312 bytes stack frame, 340 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_PreMulSum_uint64_tv 560 bytes stack frame, 424 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_PreMulSum_uint64_tv 176 bytes stack frame, 172 bytes spill stores, 172 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_uint64_tv 304 bytes stack frame, 320 bytes spill stores, 320 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_uint64_tv 432 bytes stack frame, 476 bytes spill stores, 924 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_PreMulSum_uint64_tv 568 bytes stack frame, 432 bytes spill stores, 560 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_PreMulSum_uint64_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_uint64_tv 248 bytes stack frame, 264 bytes spill stores, 252 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_uint64_tv 336 bytes stack frame, 376 bytes spill stores, 632 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_PreMulSum_uint64_tv 536 bytes stack frame, 400 bytes spill stores, 484 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_PreMulSum_uint64_tv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_PreMulSum_uint64_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_uint8_tv 256 bytes stack frame, 272 bytes spill stores, 268 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_uint8_tv 288 bytes stack frame, 316 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_PreMulSum_uint8_tv 488 bytes stack frame, 328 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_PreMulSum_uint8_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_uint8_tv 256 bytes stack frame, 272 bytes spill stores, 268 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_uint8_tv 288 bytes stack frame, 316 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_PreMulSum_uint8_tv 488 bytes stack frame, 328 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_PreMulSum_uint8_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_uint8_tv 256 bytes stack frame, 272 bytes spill stores, 268 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_uint8_tv 288 bytes stack frame, 316 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_PreMulSum_uint8_tv 488 bytes stack frame, 328 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_PreMulSum_uint8_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_uint8_tv 280 bytes stack frame, 296 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_uint8_tv 336 bytes stack frame, 384 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_PreMulSum_uint8_tv 552 bytes stack frame, 416 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_PreMulSum_uint8_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_uint8_tv 416 bytes stack frame, 468 bytes spill stores, 652 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_uint8_tv 472 bytes stack frame, 700 bytes spill stores, 1280 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_PreMulSum_uint8_tv 552 bytes stack frame, 416 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_PreMulSum_uint8_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_uint8_tv 344 bytes stack frame, 432 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_uint8_tv 376 bytes stack frame, 544 bytes spill stores, 848 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_PreMulSum_uint8_tv 520 bytes stack frame, 368 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_PreMulSum_uint8_tv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=1 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod___nv_bfloat16v 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod___nv_bfloat16v 448 bytes stack frame, 928 bytes spill stores, 1100 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_Prod___nv_bfloat16v 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_Prod___nv_bfloat16v 480 bytes stack frame, 344 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_Prod___nv_bfloat16v 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_Prod___nv_bfloat16v 296 bytes stack frame, 372 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_Prod___nv_bfloat16v 680 bytes stack frame, 272 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_Prod___nv_bfloat16v 336 bytes stack frame, 564 bytes spill stores, 716 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod___nv_bfloat16v 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod___nv_bfloat16v 448 bytes stack frame, 928 bytes spill stores, 1100 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_Prod___nv_bfloat16v 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_Prod___nv_bfloat16v 480 bytes stack frame, 344 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_Prod___nv_bfloat16v 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_Prod___nv_bfloat16v 296 bytes stack frame, 372 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_Prod___nv_bfloat16v 680 bytes stack frame, 272 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_Prod___nv_bfloat16v 336 bytes stack frame, 564 bytes spill stores, 716 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod___nv_bfloat16v 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod___nv_bfloat16v 448 bytes stack frame, 928 bytes spill stores, 1100 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_Prod___nv_bfloat16v 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_Prod___nv_bfloat16v 480 bytes stack frame, 344 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_Prod___nv_bfloat16v 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_Prod___nv_bfloat16v 296 bytes stack frame, 372 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_Prod___nv_bfloat16v 680 bytes stack frame, 272 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_Prod___nv_bfloat16v 336 bytes stack frame, 564 bytes spill stores, 716 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod___nv_bfloat16v 208 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod___nv_bfloat16v 488 bytes stack frame, 992 bytes spill stores, 1220 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_Prod___nv_bfloat16v 280 bytes stack frame, 304 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_Prod___nv_bfloat16v 536 bytes stack frame, 400 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_Prod___nv_bfloat16v 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_Prod___nv_bfloat16v 320 bytes stack frame, 432 bytes spill stores, 612 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_Prod___nv_bfloat16v 712 bytes stack frame, 308 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_Prod___nv_bfloat16v 360 bytes stack frame, 584 bytes spill stores, 804 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod___nv_bfloat16v 256 bytes stack frame, 276 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod___nv_bfloat16v 576 bytes stack frame, 1288 bytes spill stores, 2068 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_Prod___nv_bfloat16v 400 bytes stack frame, 476 bytes spill stores, 1012 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_Prod___nv_bfloat16v 560 bytes stack frame, 416 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_Prod___nv_bfloat16v 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_Prod___nv_bfloat16v 528 bytes stack frame, 1036 bytes spill stores, 1596 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_Prod___nv_bfloat16v 560 bytes stack frame, 408 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_Prod___nv_bfloat16v 360 bytes stack frame, 572 bytes spill stores, 788 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Prod___nv_bfloat16v 240 bytes stack frame, 264 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Prod___nv_bfloat16v 568 bytes stack frame, 1320 bytes spill stores, 2000 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_Prod___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_Prod___nv_bfloat16v 320 bytes stack frame, 352 bytes spill stores, 612 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_Prod___nv_bfloat16v 504 bytes stack frame, 360 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_Prod___nv_bfloat16v 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_Prod___nv_bfloat16v 472 bytes stack frame, 888 bytes spill stores, 1296 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_Prod___nv_bfloat16v 480 bytes stack frame, 288 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_Prod___nv_bfloat16v 352 bytes stack frame, 540 bytes spill stores, 748 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_int8_tv 216 bytes stack frame, 236 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_int8_tv 504 bytes stack frame, 1192 bytes spill stores, 1764 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Max_int8_tv 288 bytes stack frame, 312 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Max_int8_tv 480 bytes stack frame, 320 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Max_int8_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Max_int8_tv 472 bytes stack frame, 1012 bytes spill stores, 1476 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Max_int8_tv 840 bytes stack frame, 436 bytes spill stores, 624 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Max_int8_tv 344 bytes stack frame, 580 bytes spill stores, 744 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_int8_tv 216 bytes stack frame, 236 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_int8_tv 504 bytes stack frame, 1192 bytes spill stores, 1764 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Max_int8_tv 288 bytes stack frame, 312 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Max_int8_tv 480 bytes stack frame, 320 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Max_int8_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Max_int8_tv 472 bytes stack frame, 1012 bytes spill stores, 1476 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Max_int8_tv 840 bytes stack frame, 436 bytes spill stores, 624 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Max_int8_tv 344 bytes stack frame, 580 bytes spill stores, 744 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_int8_tv 216 bytes stack frame, 236 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_int8_tv 504 bytes stack frame, 1192 bytes spill stores, 1764 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Max_int8_tv 288 bytes stack frame, 312 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Max_int8_tv 480 bytes stack frame, 320 bytes spill stores, 392 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Max_int8_tv 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Max_int8_tv 472 bytes stack frame, 1012 bytes spill stores, 1476 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Max_int8_tv 840 bytes stack frame, 436 bytes spill stores, 624 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Max_int8_tv 344 bytes stack frame, 580 bytes spill stores, 744 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_int8_tv 288 bytes stack frame, 312 bytes spill stores, 412 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_int8_tv 544 bytes stack frame, 1196 bytes spill stores, 1712 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Max_int8_tv 384 bytes stack frame, 412 bytes spill stores, 972 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Max_int8_tv 544 bytes stack frame, 416 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Max_int8_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Max_int8_tv 536 bytes stack frame, 1144 bytes spill stores, 1660 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Max_int8_tv 872 bytes stack frame, 500 bytes spill stores, 704 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Max_int8_tv 376 bytes stack frame, 588 bytes spill stores, 848 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_int8_tv 376 bytes stack frame, 644 bytes spill stores, 1360 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_int8_tv 680 bytes stack frame, 1976 bytes spill stores, 3500 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Max_int8_tv 536 bytes stack frame, 804 bytes spill stores, 1744 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Max_int8_tv 544 bytes stack frame, 396 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Max_int8_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Max_int8_tv 928 bytes stack frame, 2236 bytes spill stores, 2964 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Max_int8_tv 856 bytes stack frame, 456 bytes spill stores, 648 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Max_int8_tv 384 bytes stack frame, 592 bytes spill stores, 840 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_TREE_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z46ncclFunction_AllReduce_NVLS_TREE_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_NVLS_SIMPLE_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_NVLS_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_NVLS_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max_int8_tv 368 bytes stack frame, 640 bytes spill stores, 1476 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max_int8_tv 672 bytes stack frame, 2188 bytes spill stores, 4068 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_RING_SIMPLE_Max_int8_tv 424 bytes stack frame, 640 bytes spill stores, 1344 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_RING_LL128_Max_int8_tv 480 bytes stack frame, 340 bytes spill stores, 436 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_RING_LL_Max_int8_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z45ncclFunction_AllReduce_TREE_SIMPLE_Max_int8_tv 856 bytes stack frame, 2224 bytes spill stores, 2932 bytes spill loads ptxas info : Function properties for _Z44ncclFunction_AllReduce_TREE_LL128_Max_int8_tv 784 bytes stack frame, 380 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z41ncclFunction_AllReduce_TREE_LL_Max_int8_tv 368 bytes stack frame, 560 bytes spill stores, 868 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_int8_tv 256 bytes stack frame, 272 bytes spill stores, 268 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_int8_tv 288 bytes stack frame, 316 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_RING_LL128_PreMulSum_int8_tv 488 bytes stack frame, 328 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_LL_PreMulSum_int8_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_int8_tv 256 bytes stack frame, 272 bytes spill stores, 268 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_int8_tv 288 bytes stack frame, 316 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_RING_LL128_PreMulSum_int8_tv 488 bytes stack frame, 328 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_LL_PreMulSum_int8_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_int8_tv 256 bytes stack frame, 272 bytes spill stores, 268 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_int8_tv 288 bytes stack frame, 316 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_RING_LL128_PreMulSum_int8_tv 488 bytes stack frame, 328 bytes spill stores, 416 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_LL_PreMulSum_int8_tv 144 bytes stack frame, 140 bytes spill stores, 140 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_int8_tv 280 bytes stack frame, 296 bytes spill stores, 296 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_int8_tv 336 bytes stack frame, 384 bytes spill stores, 500 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_RING_LL128_PreMulSum_int8_tv 552 bytes stack frame, 416 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_LL_PreMulSum_int8_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_int8_tv 416 bytes stack frame, 468 bytes spill stores, 652 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_int8_tv 472 bytes stack frame, 700 bytes spill stores, 1280 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_RING_LL128_PreMulSum_int8_tv 552 bytes stack frame, 416 bytes spill stores, 512 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_LL_PreMulSum_int8_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum_int8_tv 344 bytes stack frame, 432 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_NVLS_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum_int8_tv 376 bytes stack frame, 544 bytes spill stores, 848 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_RING_LL128_PreMulSum_int8_tv 520 bytes stack frame, 368 bytes spill stores, 456 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_RING_LL_PreMulSum_int8_tv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_ReduceScatter_TREE_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_ReduceScatter_TREE_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_int8_tv 272 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_int8_tv 312 bytes stack frame, 368 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_int8_tv 512 bytes stack frame, 372 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_SumPostDiv_int8_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_int8_tv 272 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_int8_tv 312 bytes stack frame, 368 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_int8_tv 512 bytes stack frame, 372 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_SumPostDiv_int8_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_int8_tv 272 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_int8_tv 312 bytes stack frame, 368 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_int8_tv 512 bytes stack frame, 372 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_SumPostDiv_int8_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_int8_tv 288 bytes stack frame, 308 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_int8_tv 360 bytes stack frame, 412 bytes spill stores, 536 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_int8_tv 568 bytes stack frame, 436 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_SumPostDiv_int8_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_int8_tv 424 bytes stack frame, 492 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_int8_tv 504 bytes stack frame, 700 bytes spill stores, 1272 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_int8_tv 568 bytes stack frame, 440 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_SumPostDiv_int8_tv 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_int8_tv 416 bytes stack frame, 556 bytes spill stores, 956 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_int8_tv 432 bytes stack frame, 544 bytes spill stores, 872 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_int8_tv 528 bytes stack frame, 392 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_RING_LL_SumPostDiv_int8_tv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_uint8_tv 272 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_uint8_tv 312 bytes stack frame, 368 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_uint8_tv 512 bytes stack frame, 372 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_SumPostDiv_uint8_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_uint8_tv 272 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_uint8_tv 312 bytes stack frame, 368 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_uint8_tv 512 bytes stack frame, 372 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_SumPostDiv_uint8_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_uint8_tv 272 bytes stack frame, 288 bytes spill stores, 288 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_uint8_tv 312 bytes stack frame, 368 bytes spill stores, 452 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_uint8_tv 512 bytes stack frame, 372 bytes spill stores, 444 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_SumPostDiv_uint8_tv 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_uint8_tv 288 bytes stack frame, 308 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_uint8_tv 360 bytes stack frame, 420 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_uint8_tv 568 bytes stack frame, 444 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_SumPostDiv_uint8_tv 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_uint8_tv 424 bytes stack frame, 492 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_uint8_tv 504 bytes stack frame, 700 bytes spill stores, 1272 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_uint8_tv 568 bytes stack frame, 440 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_SumPostDiv_uint8_tv 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_TREE_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_TREE_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_NVLS_SIMPLE_SumPostDiv_uint8_tv 416 bytes stack frame, 556 bytes spill stores, 956 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_NVLS_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_NVLS_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z65ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_RING_SIMPLE_SumPostDiv_uint8_tv 416 bytes stack frame, 544 bytes spill stores, 872 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_RING_LL128_SumPostDiv_uint8_tv 528 bytes stack frame, 392 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_RING_LL_SumPostDiv_uint8_tv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_ReduceScatter_TREE_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_ReduceScatter_TREE_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_ReduceScatter_TREE_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint8_tv 216 bytes stack frame, 236 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint8_tv 520 bytes stack frame, 1144 bytes spill stores, 1672 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_uint8_tv 296 bytes stack frame, 332 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_PreMulSum_uint8_tv 464 bytes stack frame, 312 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_PreMulSum_uint8_tv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_uint8_tv 504 bytes stack frame, 1048 bytes spill stores, 1556 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_PreMulSum_uint8_tv 824 bytes stack frame, 420 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_PreMulSum_uint8_tv 352 bytes stack frame, 580 bytes spill stores, 744 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint8_tv 216 bytes stack frame, 236 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint8_tv 520 bytes stack frame, 1144 bytes spill stores, 1672 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_uint8_tv 296 bytes stack frame, 332 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_PreMulSum_uint8_tv 464 bytes stack frame, 312 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_PreMulSum_uint8_tv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_uint8_tv 504 bytes stack frame, 1048 bytes spill stores, 1556 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_PreMulSum_uint8_tv 824 bytes stack frame, 420 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_PreMulSum_uint8_tv 352 bytes stack frame, 580 bytes spill stores, 744 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint8_tv 216 bytes stack frame, 236 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint8_tv 520 bytes stack frame, 1144 bytes spill stores, 1672 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_uint8_tv 296 bytes stack frame, 332 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_PreMulSum_uint8_tv 464 bytes stack frame, 312 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_PreMulSum_uint8_tv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_uint8_tv 504 bytes stack frame, 1048 bytes spill stores, 1556 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_PreMulSum_uint8_tv 824 bytes stack frame, 420 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_PreMulSum_uint8_tv 352 bytes stack frame, 580 bytes spill stores, 744 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint8_tv 232 bytes stack frame, 252 bytes spill stores, 244 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint8_tv 552 bytes stack frame, 1212 bytes spill stores, 1752 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_uint8_tv 336 bytes stack frame, 384 bytes spill stores, 632 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_PreMulSum_uint8_tv 520 bytes stack frame, 376 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_PreMulSum_uint8_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_uint8_tv 520 bytes stack frame, 1072 bytes spill stores, 1636 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_PreMulSum_uint8_tv 856 bytes stack frame, 460 bytes spill stores, 652 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_PreMulSum_uint8_tv 384 bytes stack frame, 600 bytes spill stores, 860 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint8_tv 376 bytes stack frame, 656 bytes spill stores, 1368 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint8_tv 688 bytes stack frame, 1996 bytes spill stores, 3528 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_uint8_tv 488 bytes stack frame, 828 bytes spill stores, 1760 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_PreMulSum_uint8_tv 544 bytes stack frame, 400 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_PreMulSum_uint8_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_uint8_tv 1136 bytes stack frame, 2768 bytes spill stores, 3452 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_PreMulSum_uint8_tv 864 bytes stack frame, 468 bytes spill stores, 652 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_PreMulSum_uint8_tv 384 bytes stack frame, 612 bytes spill stores, 860 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_uint8_tv 368 bytes stack frame, 680 bytes spill stores, 1464 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_uint8_tv 672 bytes stack frame, 2092 bytes spill stores, 3860 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_uint8_tv 408 bytes stack frame, 664 bytes spill stores, 1388 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_PreMulSum_uint8_tv 472 bytes stack frame, 328 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_PreMulSum_uint8_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_uint8_tv 1112 bytes stack frame, 2820 bytes spill stores, 3516 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_PreMulSum_uint8_tv 808 bytes stack frame, 416 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_PreMulSum_uint8_tv 368 bytes stack frame, 564 bytes spill stores, 880 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int8_tv 216 bytes stack frame, 236 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int8_tv 520 bytes stack frame, 1144 bytes spill stores, 1672 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_int8_tv 296 bytes stack frame, 332 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_RING_LL128_PreMulSum_int8_tv 464 bytes stack frame, 312 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_LL_PreMulSum_int8_tv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_int8_tv 504 bytes stack frame, 1048 bytes spill stores, 1556 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_TREE_LL128_PreMulSum_int8_tv 824 bytes stack frame, 420 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_LL_PreMulSum_int8_tv 352 bytes stack frame, 580 bytes spill stores, 744 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int8_tv 216 bytes stack frame, 236 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int8_tv 520 bytes stack frame, 1144 bytes spill stores, 1672 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_int8_tv 296 bytes stack frame, 332 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_RING_LL128_PreMulSum_int8_tv 464 bytes stack frame, 312 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_LL_PreMulSum_int8_tv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_int8_tv 504 bytes stack frame, 1048 bytes spill stores, 1556 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_TREE_LL128_PreMulSum_int8_tv 824 bytes stack frame, 420 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_LL_PreMulSum_int8_tv 352 bytes stack frame, 580 bytes spill stores, 744 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int8_tv 216 bytes stack frame, 236 bytes spill stores, 228 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int8_tv 520 bytes stack frame, 1144 bytes spill stores, 1672 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_int8_tv 296 bytes stack frame, 332 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_RING_LL128_PreMulSum_int8_tv 464 bytes stack frame, 312 bytes spill stores, 368 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_LL_PreMulSum_int8_tv 152 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_int8_tv 504 bytes stack frame, 1048 bytes spill stores, 1556 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_TREE_LL128_PreMulSum_int8_tv 824 bytes stack frame, 420 bytes spill stores, 600 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_LL_PreMulSum_int8_tv 352 bytes stack frame, 580 bytes spill stores, 744 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int8_tv 232 bytes stack frame, 252 bytes spill stores, 244 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int8_tv 552 bytes stack frame, 1212 bytes spill stores, 1752 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_int8_tv 336 bytes stack frame, 384 bytes spill stores, 632 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_RING_LL128_PreMulSum_int8_tv 520 bytes stack frame, 376 bytes spill stores, 520 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_LL_PreMulSum_int8_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_int8_tv 520 bytes stack frame, 1072 bytes spill stores, 1636 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_TREE_LL128_PreMulSum_int8_tv 856 bytes stack frame, 460 bytes spill stores, 652 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_LL_PreMulSum_int8_tv 384 bytes stack frame, 600 bytes spill stores, 860 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int8_tv 376 bytes stack frame, 656 bytes spill stores, 1368 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int8_tv 688 bytes stack frame, 1996 bytes spill stores, 3528 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_int8_tv 488 bytes stack frame, 828 bytes spill stores, 1760 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_RING_LL128_PreMulSum_int8_tv 544 bytes stack frame, 400 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_LL_PreMulSum_int8_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_int8_tv 1136 bytes stack frame, 2768 bytes spill stores, 3452 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_TREE_LL128_PreMulSum_int8_tv 864 bytes stack frame, 468 bytes spill stores, 652 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_LL_PreMulSum_int8_tv 384 bytes stack frame, 612 bytes spill stores, 860 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z55ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_NVLS_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_NVLS_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum_int8_tv 368 bytes stack frame, 680 bytes spill stores, 1464 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum_int8_tv 672 bytes stack frame, 2092 bytes spill stores, 3860 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_SIMPLE_PreMulSum_int8_tv 408 bytes stack frame, 664 bytes spill stores, 1388 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_RING_LL128_PreMulSum_int8_tv 472 bytes stack frame, 328 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_RING_LL_PreMulSum_int8_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum_int8_tv 1112 bytes stack frame, 2820 bytes spill stores, 3516 bytes spill loads ptxas info : Function properties for _Z50ncclFunction_AllReduce_TREE_LL128_PreMulSum_int8_tv 808 bytes stack frame, 416 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z47ncclFunction_AllReduce_TREE_LL_PreMulSum_int8_tv 368 bytes stack frame, 564 bytes spill stores, 880 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=3 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max___nv_bfloat16v 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max___nv_bfloat16v 448 bytes stack frame, 968 bytes spill stores, 1188 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_Max___nv_bfloat16v 240 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_Max___nv_bfloat16v 480 bytes stack frame, 344 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_Max___nv_bfloat16v 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_Max___nv_bfloat16v 296 bytes stack frame, 372 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_Max___nv_bfloat16v 680 bytes stack frame, 272 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_Max___nv_bfloat16v 336 bytes stack frame, 564 bytes spill stores, 716 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max___nv_bfloat16v 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max___nv_bfloat16v 448 bytes stack frame, 968 bytes spill stores, 1188 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_Max___nv_bfloat16v 240 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_Max___nv_bfloat16v 480 bytes stack frame, 344 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_Max___nv_bfloat16v 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_Max___nv_bfloat16v 296 bytes stack frame, 372 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_Max___nv_bfloat16v 680 bytes stack frame, 272 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_Max___nv_bfloat16v 336 bytes stack frame, 564 bytes spill stores, 716 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max___nv_bfloat16v 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max___nv_bfloat16v 448 bytes stack frame, 968 bytes spill stores, 1188 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_Max___nv_bfloat16v 240 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_Max___nv_bfloat16v 480 bytes stack frame, 344 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_Max___nv_bfloat16v 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_Max___nv_bfloat16v 296 bytes stack frame, 372 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_Max___nv_bfloat16v 680 bytes stack frame, 272 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_Max___nv_bfloat16v 336 bytes stack frame, 564 bytes spill stores, 716 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max___nv_bfloat16v 208 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max___nv_bfloat16v 488 bytes stack frame, 992 bytes spill stores, 1220 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_Max___nv_bfloat16v 280 bytes stack frame, 308 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_Max___nv_bfloat16v 536 bytes stack frame, 400 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_Max___nv_bfloat16v 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_Max___nv_bfloat16v 320 bytes stack frame, 432 bytes spill stores, 612 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_Max___nv_bfloat16v 712 bytes stack frame, 308 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_Max___nv_bfloat16v 360 bytes stack frame, 584 bytes spill stores, 804 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max___nv_bfloat16v 256 bytes stack frame, 276 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max___nv_bfloat16v 576 bytes stack frame, 1284 bytes spill stores, 2044 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_Max___nv_bfloat16v 400 bytes stack frame, 476 bytes spill stores, 1012 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_Max___nv_bfloat16v 560 bytes stack frame, 416 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_Max___nv_bfloat16v 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_Max___nv_bfloat16v 504 bytes stack frame, 1008 bytes spill stores, 1580 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_Max___nv_bfloat16v 560 bytes stack frame, 408 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_Max___nv_bfloat16v 360 bytes stack frame, 572 bytes spill stores, 788 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Max___nv_bfloat16v 520 bytes stack frame, 952 bytes spill stores, 1388 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_Max___nv_bfloat16v 416 bytes stack frame, 716 bytes spill stores, 1100 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Max___nv_bfloat16v 240 bytes stack frame, 264 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Max___nv_bfloat16v 568 bytes stack frame, 1320 bytes spill stores, 2000 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_Max___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_Max___nv_bfloat16v 320 bytes stack frame, 352 bytes spill stores, 612 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_Max___nv_bfloat16v 504 bytes stack frame, 360 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_Max___nv_bfloat16v 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_Max___nv_bfloat16v 472 bytes stack frame, 888 bytes spill stores, 1296 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_Max___nv_bfloat16v 480 bytes stack frame, 288 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_Max___nv_bfloat16v 352 bytes stack frame, 540 bytes spill stores, 748 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=2 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min___nv_bfloat16v 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min___nv_bfloat16v 448 bytes stack frame, 968 bytes spill stores, 1188 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_Min___nv_bfloat16v 240 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_Min___nv_bfloat16v 480 bytes stack frame, 344 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_Min___nv_bfloat16v 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_Min___nv_bfloat16v 296 bytes stack frame, 372 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_Min___nv_bfloat16v 680 bytes stack frame, 272 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_Min___nv_bfloat16v 336 bytes stack frame, 564 bytes spill stores, 716 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min___nv_bfloat16v 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min___nv_bfloat16v 448 bytes stack frame, 968 bytes spill stores, 1188 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_Min___nv_bfloat16v 240 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_Min___nv_bfloat16v 480 bytes stack frame, 344 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_Min___nv_bfloat16v 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_Min___nv_bfloat16v 296 bytes stack frame, 372 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_Min___nv_bfloat16v 680 bytes stack frame, 272 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_Min___nv_bfloat16v 336 bytes stack frame, 564 bytes spill stores, 716 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min___nv_bfloat16v 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min___nv_bfloat16v 448 bytes stack frame, 968 bytes spill stores, 1188 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_Min___nv_bfloat16v 240 bytes stack frame, 260 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_Min___nv_bfloat16v 480 bytes stack frame, 344 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_Min___nv_bfloat16v 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_Min___nv_bfloat16v 296 bytes stack frame, 372 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_Min___nv_bfloat16v 680 bytes stack frame, 272 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_Min___nv_bfloat16v 336 bytes stack frame, 564 bytes spill stores, 716 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min___nv_bfloat16v 208 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min___nv_bfloat16v 488 bytes stack frame, 992 bytes spill stores, 1220 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_Min___nv_bfloat16v 280 bytes stack frame, 308 bytes spill stores, 420 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_Min___nv_bfloat16v 536 bytes stack frame, 400 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_Min___nv_bfloat16v 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_Min___nv_bfloat16v 320 bytes stack frame, 432 bytes spill stores, 612 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_Min___nv_bfloat16v 712 bytes stack frame, 308 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_Min___nv_bfloat16v 360 bytes stack frame, 584 bytes spill stores, 804 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min___nv_bfloat16v 256 bytes stack frame, 276 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min___nv_bfloat16v 576 bytes stack frame, 1284 bytes spill stores, 2044 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_Min___nv_bfloat16v 400 bytes stack frame, 476 bytes spill stores, 1012 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_Min___nv_bfloat16v 560 bytes stack frame, 416 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_Min___nv_bfloat16v 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_Min___nv_bfloat16v 504 bytes stack frame, 1008 bytes spill stores, 1580 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_Min___nv_bfloat16v 560 bytes stack frame, 408 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_Min___nv_bfloat16v 360 bytes stack frame, 572 bytes spill stores, 788 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Min___nv_bfloat16v 520 bytes stack frame, 952 bytes spill stores, 1388 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_Min___nv_bfloat16v 416 bytes stack frame, 716 bytes spill stores, 1100 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Min___nv_bfloat16v 240 bytes stack frame, 264 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Min___nv_bfloat16v 568 bytes stack frame, 1320 bytes spill stores, 2000 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_Min___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_Min___nv_bfloat16v 320 bytes stack frame, 352 bytes spill stores, 612 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_Min___nv_bfloat16v 504 bytes stack frame, 360 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_Min___nv_bfloat16v 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_Min___nv_bfloat16v 472 bytes stack frame, 888 bytes spill stores, 1296 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_Min___nv_bfloat16v 480 bytes stack frame, 288 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_Min___nv_bfloat16v 352 bytes stack frame, 540 bytes spill stores, 748 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling reduce_scatter.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 0 bytes gmem ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum___nv_bfloat16v 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z71ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z70ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z72ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z71ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z68ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum___nv_bfloat16v 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_RING_LL128_PreMulSum___nv_bfloat16v 504 bytes stack frame, 364 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_RING_LL_PreMulSum___nv_bfloat16v 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_TREE_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_TREE_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum___nv_bfloat16v 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z71ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z70ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z72ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z71ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z68ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum___nv_bfloat16v 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_RING_LL128_PreMulSum___nv_bfloat16v 504 bytes stack frame, 364 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_RING_LL_PreMulSum___nv_bfloat16v 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_TREE_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_TREE_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum___nv_bfloat16v 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z71ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z70ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z72ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z71ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z68ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum___nv_bfloat16v 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_RING_LL128_PreMulSum___nv_bfloat16v 504 bytes stack frame, 364 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_RING_LL_PreMulSum___nv_bfloat16v 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_TREE_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_TREE_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum___nv_bfloat16v 232 bytes stack frame, 232 bytes spill stores, 232 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z71ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z70ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z72ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z71ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z68ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum___nv_bfloat16v 264 bytes stack frame, 292 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_RING_LL128_PreMulSum___nv_bfloat16v 560 bytes stack frame, 428 bytes spill stores, 548 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_RING_LL_PreMulSum___nv_bfloat16v 184 bytes stack frame, 180 bytes spill stores, 180 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_TREE_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_TREE_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum___nv_bfloat16v 288 bytes stack frame, 308 bytes spill stores, 308 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z71ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z70ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z72ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z71ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z68ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum___nv_bfloat16v 408 bytes stack frame, 448 bytes spill stores, 784 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_RING_LL128_PreMulSum___nv_bfloat16v 568 bytes stack frame, 440 bytes spill stores, 568 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_RING_LL_PreMulSum___nv_bfloat16v 176 bytes stack frame, 176 bytes spill stores, 176 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_TREE_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_TREE_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 0 bytes gmem ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_NVLS_TREE_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_ReduceScatter_NVLS_TREE_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_ReduceScatter_NVLS_TREE_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_NVLS_SIMPLE_PreMulSum___nv_bfloat16v 240 bytes stack frame, 256 bytes spill stores, 244 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_NVLS_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_NVLS_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z71ncclFunction_ReduceScatter_COLLNET_CHAIN_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z70ncclFunction_ReduceScatter_COLLNET_CHAIN_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_ReduceScatter_COLLNET_CHAIN_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z72ncclFunction_ReduceScatter_COLLNET_DIRECT_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z71ncclFunction_ReduceScatter_COLLNET_DIRECT_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z68ncclFunction_ReduceScatter_COLLNET_DIRECT_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_RING_SIMPLE_PreMulSum___nv_bfloat16v 296 bytes stack frame, 320 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_RING_LL128_PreMulSum___nv_bfloat16v 520 bytes stack frame, 392 bytes spill stores, 468 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_RING_LL_PreMulSum___nv_bfloat16v 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_ReduceScatter_TREE_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_ReduceScatter_TREE_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_ReduceScatter_TREE_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=0 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z51ncclKernel_AllReduce_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z51ncclKernel_AllReduce_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z46ncclKernel_AllReduce_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z55ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z55ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z56ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z56ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z46ncclKernel_AllReduce_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_50' ptxas info : Function properties for _Z46ncclKernel_AllReduce_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 160 bytes stack frame, 344 bytes spill stores, 552 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum___nv_bfloat16v 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum___nv_bfloat16v 448 bytes stack frame, 928 bytes spill stores, 1100 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_Sum___nv_bfloat16v 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_Sum___nv_bfloat16v 480 bytes stack frame, 344 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_Sum___nv_bfloat16v 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_Sum___nv_bfloat16v 296 bytes stack frame, 372 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_Sum___nv_bfloat16v 680 bytes stack frame, 272 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_Sum___nv_bfloat16v 336 bytes stack frame, 564 bytes spill stores, 716 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z51ncclKernel_AllReduce_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z51ncclKernel_AllReduce_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z46ncclKernel_AllReduce_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z55ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z55ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z56ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z56ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z46ncclKernel_AllReduce_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_60' ptxas info : Function properties for _Z46ncclKernel_AllReduce_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 160 bytes stack frame, 344 bytes spill stores, 552 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum___nv_bfloat16v 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum___nv_bfloat16v 448 bytes stack frame, 928 bytes spill stores, 1100 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_Sum___nv_bfloat16v 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_Sum___nv_bfloat16v 480 bytes stack frame, 344 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_Sum___nv_bfloat16v 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_Sum___nv_bfloat16v 296 bytes stack frame, 372 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_Sum___nv_bfloat16v 680 bytes stack frame, 272 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_Sum___nv_bfloat16v 336 bytes stack frame, 564 bytes spill stores, 716 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z51ncclKernel_AllReduce_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z51ncclKernel_AllReduce_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z46ncclKernel_AllReduce_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z55ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z55ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z56ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z56ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 344 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z46ncclKernel_AllReduce_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_61' ptxas info : Function properties for _Z46ncclKernel_AllReduce_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 160 bytes stack frame, 344 bytes spill stores, 552 bytes spill loads ptxas info : Used 96 registers, 344 bytes cmem[0], 12 bytes cmem[2] ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum___nv_bfloat16v 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum___nv_bfloat16v 448 bytes stack frame, 928 bytes spill stores, 1100 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_Sum___nv_bfloat16v 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_Sum___nv_bfloat16v 480 bytes stack frame, 344 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_Sum___nv_bfloat16v 144 bytes stack frame, 144 bytes spill stores, 144 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_Sum___nv_bfloat16v 296 bytes stack frame, 372 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_Sum___nv_bfloat16v 680 bytes stack frame, 272 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_Sum___nv_bfloat16v 336 bytes stack frame, 564 bytes spill stores, 716 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z51ncclKernel_AllReduce_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z51ncclKernel_AllReduce_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z46ncclKernel_AllReduce_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z55ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z55ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z56ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z56ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z46ncclKernel_AllReduce_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_70' ptxas info : Function properties for _Z46ncclKernel_AllReduce_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 168 bytes stack frame, 336 bytes spill stores, 572 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum___nv_bfloat16v 208 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum___nv_bfloat16v 488 bytes stack frame, 992 bytes spill stores, 1220 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_Sum___nv_bfloat16v 280 bytes stack frame, 304 bytes spill stores, 380 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_Sum___nv_bfloat16v 536 bytes stack frame, 400 bytes spill stores, 516 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_Sum___nv_bfloat16v 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_Sum___nv_bfloat16v 320 bytes stack frame, 432 bytes spill stores, 612 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_Sum___nv_bfloat16v 712 bytes stack frame, 308 bytes spill stores, 488 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_Sum___nv_bfloat16v 360 bytes stack frame, 584 bytes spill stores, 804 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z51ncclKernel_AllReduce_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z51ncclKernel_AllReduce_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z46ncclKernel_AllReduce_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z55ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z55ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z56ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z56ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z46ncclKernel_AllReduce_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_80' ptxas info : Function properties for _Z46ncclKernel_AllReduce_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 160 bytes stack frame, 328 bytes spill stores, 564 bytes spill loads ptxas info : Used 96 registers, 376 bytes cmem[0] ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum___nv_bfloat16v 256 bytes stack frame, 276 bytes spill stores, 328 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum___nv_bfloat16v 576 bytes stack frame, 1288 bytes spill stores, 2068 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_Sum___nv_bfloat16v 400 bytes stack frame, 476 bytes spill stores, 1012 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_Sum___nv_bfloat16v 560 bytes stack frame, 416 bytes spill stores, 544 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_Sum___nv_bfloat16v 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_Sum___nv_bfloat16v 528 bytes stack frame, 1036 bytes spill stores, 1596 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_Sum___nv_bfloat16v 560 bytes stack frame, 408 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_Sum___nv_bfloat16v 360 bytes stack frame, 572 bytes spill stores, 788 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Compiling entry function '_Z51ncclKernel_AllReduce_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z51ncclKernel_AllReduce_NVLS_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z46ncclKernel_AllReduce_NVLS_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z55ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z55ncclKernel_AllReduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z56ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z56ncclKernel_AllReduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z46ncclKernel_AllReduce_RING_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 96 registers ptxas info : Compiling entry function '_Z46ncclKernel_AllReduce_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork' for 'sm_90' ptxas info : Function properties for _Z46ncclKernel_AllReduce_TREE_LL_Sum___nv_bfloat16P11ncclDevCommmP8ncclWork 136 bytes stack frame, 276 bytes spill stores, 488 bytes spill loads ptxas info : Used 96 registers ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_Sum___nv_bfloat16v 520 bytes stack frame, 952 bytes spill stores, 1388 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_Sum___nv_bfloat16v 416 bytes stack frame, 716 bytes spill stores, 1100 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum___nv_bfloat16v 240 bytes stack frame, 264 bytes spill stores, 324 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum___nv_bfloat16v 568 bytes stack frame, 1320 bytes spill stores, 2000 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_Sum___nv_bfloat16v 320 bytes stack frame, 352 bytes spill stores, 612 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_Sum___nv_bfloat16v 504 bytes stack frame, 360 bytes spill stores, 496 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_Sum___nv_bfloat16v 184 bytes stack frame, 184 bytes spill stores, 184 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_Sum___nv_bfloat16v 472 bytes stack frame, 888 bytes spill stores, 1296 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_Sum___nv_bfloat16v 480 bytes stack frame, 288 bytes spill stores, 260 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_Sum___nv_bfloat16v 352 bytes stack frame, 540 bytes spill stores, 748 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_i8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=0 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_i8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_i8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int8_tv 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int8_tv 504 bytes stack frame, 1288 bytes spill stores, 1908 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_int8_tv 320 bytes stack frame, 404 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_SumPostDiv_int8_tv 488 bytes stack frame, 360 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_SumPostDiv_int8_tv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_int8_tv 536 bytes stack frame, 1096 bytes spill stores, 1680 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_SumPostDiv_int8_tv 816 bytes stack frame, 416 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_SumPostDiv_int8_tv 352 bytes stack frame, 580 bytes spill stores, 744 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int8_tv 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int8_tv 504 bytes stack frame, 1288 bytes spill stores, 1908 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_int8_tv 320 bytes stack frame, 404 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_SumPostDiv_int8_tv 480 bytes stack frame, 336 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_SumPostDiv_int8_tv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_int8_tv 536 bytes stack frame, 1096 bytes spill stores, 1680 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_SumPostDiv_int8_tv 816 bytes stack frame, 416 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_SumPostDiv_int8_tv 352 bytes stack frame, 580 bytes spill stores, 744 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int8_tv 200 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int8_tv 504 bytes stack frame, 1288 bytes spill stores, 1908 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_int8_tv 320 bytes stack frame, 404 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_SumPostDiv_int8_tv 480 bytes stack frame, 336 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_SumPostDiv_int8_tv 160 bytes stack frame, 160 bytes spill stores, 160 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_int8_tv 536 bytes stack frame, 1096 bytes spill stores, 1680 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_SumPostDiv_int8_tv 816 bytes stack frame, 416 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_SumPostDiv_int8_tv 352 bytes stack frame, 580 bytes spill stores, 744 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int8_tv 256 bytes stack frame, 276 bytes spill stores, 356 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int8_tv 552 bytes stack frame, 1220 bytes spill stores, 1692 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_int8_tv 376 bytes stack frame, 448 bytes spill stores, 816 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_SumPostDiv_int8_tv 544 bytes stack frame, 420 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_SumPostDiv_int8_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_int8_tv 560 bytes stack frame, 1188 bytes spill stores, 1760 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_SumPostDiv_int8_tv 856 bytes stack frame, 484 bytes spill stores, 668 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_SumPostDiv_int8_tv 384 bytes stack frame, 580 bytes spill stores, 884 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int8_tv 392 bytes stack frame, 712 bytes spill stores, 1444 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int8_tv 680 bytes stack frame, 2044 bytes spill stores, 3584 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_int8_tv 528 bytes stack frame, 888 bytes spill stores, 1848 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_SumPostDiv_int8_tv 536 bytes stack frame, 396 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_SumPostDiv_int8_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_int8_tv 1080 bytes stack frame, 2668 bytes spill stores, 3428 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_SumPostDiv_int8_tv 856 bytes stack frame, 488 bytes spill stores, 672 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_SumPostDiv_int8_tv 392 bytes stack frame, 580 bytes spill stores, 888 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z56ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_NVLS_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_int8_tv 376 bytes stack frame, 696 bytes spill stores, 1456 bytes spill loads ptxas info : Function properties for _Z60ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_int8_tv 672 bytes stack frame, 2336 bytes spill stores, 4232 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_int8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_int8_tv 432 bytes stack frame, 668 bytes spill stores, 1424 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_RING_LL128_SumPostDiv_int8_tv 496 bytes stack frame, 340 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_RING_LL_SumPostDiv_int8_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_int8_tv 1016 bytes stack frame, 2756 bytes spill stores, 3536 bytes spill loads ptxas info : Function properties for _Z51ncclFunction_AllReduce_TREE_LL128_SumPostDiv_int8_tv 792 bytes stack frame, 388 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z48ncclFunction_AllReduce_TREE_LL_SumPostDiv_int8_tv 368 bytes stack frame, 564 bytes spill stores, 872 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_u8.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=5 -DNCCL_TYPE=1 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_u8.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_u8.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint8_tv 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint8_tv 504 bytes stack frame, 1288 bytes spill stores, 1908 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_uint8_tv 320 bytes stack frame, 404 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_SumPostDiv_uint8_tv 488 bytes stack frame, 360 bytes spill stores, 408 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_SumPostDiv_uint8_tv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_uint8_tv 536 bytes stack frame, 1096 bytes spill stores, 1680 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_SumPostDiv_uint8_tv 816 bytes stack frame, 416 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_SumPostDiv_uint8_tv 352 bytes stack frame, 580 bytes spill stores, 744 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint8_tv 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint8_tv 504 bytes stack frame, 1288 bytes spill stores, 1908 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_uint8_tv 320 bytes stack frame, 404 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_SumPostDiv_uint8_tv 480 bytes stack frame, 336 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_SumPostDiv_uint8_tv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_uint8_tv 536 bytes stack frame, 1096 bytes spill stores, 1680 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_SumPostDiv_uint8_tv 816 bytes stack frame, 416 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_SumPostDiv_uint8_tv 352 bytes stack frame, 580 bytes spill stores, 744 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint8_tv 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint8_tv 504 bytes stack frame, 1288 bytes spill stores, 1908 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_uint8_tv 320 bytes stack frame, 404 bytes spill stores, 616 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_SumPostDiv_uint8_tv 480 bytes stack frame, 336 bytes spill stores, 400 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_SumPostDiv_uint8_tv 160 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_uint8_tv 536 bytes stack frame, 1096 bytes spill stores, 1680 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_SumPostDiv_uint8_tv 816 bytes stack frame, 416 bytes spill stores, 608 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_SumPostDiv_uint8_tv 352 bytes stack frame, 580 bytes spill stores, 744 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint8_tv 256 bytes stack frame, 280 bytes spill stores, 376 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint8_tv 552 bytes stack frame, 1232 bytes spill stores, 1704 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_uint8_tv 384 bytes stack frame, 444 bytes spill stores, 812 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_SumPostDiv_uint8_tv 544 bytes stack frame, 420 bytes spill stores, 552 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_SumPostDiv_uint8_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_uint8_tv 568 bytes stack frame, 1184 bytes spill stores, 1760 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_SumPostDiv_uint8_tv 856 bytes stack frame, 484 bytes spill stores, 668 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_SumPostDiv_uint8_tv 384 bytes stack frame, 580 bytes spill stores, 884 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint8_tv 384 bytes stack frame, 716 bytes spill stores, 1440 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint8_tv 672 bytes stack frame, 2044 bytes spill stores, 3640 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_uint8_tv 520 bytes stack frame, 888 bytes spill stores, 1848 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_SumPostDiv_uint8_tv 536 bytes stack frame, 396 bytes spill stores, 564 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_SumPostDiv_uint8_tv 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_uint8_tv 1080 bytes stack frame, 2668 bytes spill stores, 3428 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_SumPostDiv_uint8_tv 856 bytes stack frame, 488 bytes spill stores, 672 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_SumPostDiv_uint8_tv 392 bytes stack frame, 580 bytes spill stores, 888 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_TREE_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_TREE_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_TREE_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_NVLS_SIMPLE_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_NVLS_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_NVLS_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_SumPostDiv_uint8_tv 368 bytes stack frame, 688 bytes spill stores, 1508 bytes spill loads ptxas info : Function properties for _Z61ncclFunction_AllReduce_COLLNET_CHAIN_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_COLLNET_CHAIN_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_SumPostDiv_uint8_tv 664 bytes stack frame, 2092 bytes spill stores, 3832 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_COLLNET_DIRECT_LL128_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_COLLNET_DIRECT_LL_SumPostDiv_uint8_tv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_RING_SIMPLE_SumPostDiv_uint8_tv 448 bytes stack frame, 668 bytes spill stores, 1416 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_RING_LL128_SumPostDiv_uint8_tv 496 bytes stack frame, 340 bytes spill stores, 448 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_RING_LL_SumPostDiv_uint8_tv 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z53ncclFunction_AllReduce_TREE_SIMPLE_SumPostDiv_uint8_tv 1016 bytes stack frame, 2756 bytes spill stores, 3536 bytes spill loads ptxas info : Function properties for _Z52ncclFunction_AllReduce_TREE_LL128_SumPostDiv_uint8_tv 792 bytes stack frame, 388 bytes spill stores, 556 bytes spill loads ptxas info : Function properties for _Z49ncclFunction_AllReduce_TREE_LL_SumPostDiv_uint8_tv 368 bytes stack frame, 564 bytes spill stores, 872 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Compiling all_reduce.cu > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_bf16.o mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device /usr/local/cuda/bin/nvcc -DNCCL_OP=4 -DNCCL_TYPE=9 -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dc /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_bf16.cu -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_bf16.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' ptxas info : 4 bytes gmem ptxas info : Function properties for _Z63ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum___nv_bfloat16v 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z68ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum___nv_bfloat16v 464 bytes stack frame, 984 bytes spill stores, 1220 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_RING_SIMPLE_PreMulSum___nv_bfloat16v 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_RING_LL128_PreMulSum___nv_bfloat16v 488 bytes stack frame, 352 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_RING_LL_PreMulSum___nv_bfloat16v 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum___nv_bfloat16v 312 bytes stack frame, 452 bytes spill stores, 624 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_TREE_LL128_PreMulSum___nv_bfloat16v 688 bytes stack frame, 296 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_TREE_LL_PreMulSum___nv_bfloat16v 344 bytes stack frame, 576 bytes spill stores, 732 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z63ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum___nv_bfloat16v 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z68ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum___nv_bfloat16v 464 bytes stack frame, 984 bytes spill stores, 1220 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_RING_SIMPLE_PreMulSum___nv_bfloat16v 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_RING_LL128_PreMulSum___nv_bfloat16v 488 bytes stack frame, 352 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_RING_LL_PreMulSum___nv_bfloat16v 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum___nv_bfloat16v 312 bytes stack frame, 452 bytes spill stores, 624 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_TREE_LL128_PreMulSum___nv_bfloat16v 680 bytes stack frame, 292 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_TREE_LL_PreMulSum___nv_bfloat16v 344 bytes stack frame, 576 bytes spill stores, 732 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z63ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum___nv_bfloat16v 200 bytes stack frame, 196 bytes spill stores, 196 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z68ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum___nv_bfloat16v 464 bytes stack frame, 984 bytes spill stores, 1220 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_RING_SIMPLE_PreMulSum___nv_bfloat16v 240 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_RING_LL128_PreMulSum___nv_bfloat16v 488 bytes stack frame, 352 bytes spill stores, 404 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_RING_LL_PreMulSum___nv_bfloat16v 152 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum___nv_bfloat16v 312 bytes stack frame, 452 bytes spill stores, 624 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_TREE_LL128_PreMulSum___nv_bfloat16v 680 bytes stack frame, 292 bytes spill stores, 396 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_TREE_LL_PreMulSum___nv_bfloat16v 344 bytes stack frame, 576 bytes spill stores, 732 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z63ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum___nv_bfloat16v 208 bytes stack frame, 208 bytes spill stores, 208 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z68ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum___nv_bfloat16v 496 bytes stack frame, 1016 bytes spill stores, 1312 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_RING_SIMPLE_PreMulSum___nv_bfloat16v 288 bytes stack frame, 312 bytes spill stores, 440 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_RING_LL128_PreMulSum___nv_bfloat16v 528 bytes stack frame, 384 bytes spill stores, 508 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_RING_LL_PreMulSum___nv_bfloat16v 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum___nv_bfloat16v 336 bytes stack frame, 484 bytes spill stores, 640 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_TREE_LL128_PreMulSum___nv_bfloat16v 728 bytes stack frame, 312 bytes spill stores, 492 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_TREE_LL_PreMulSum___nv_bfloat16v 368 bytes stack frame, 584 bytes spill stores, 824 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z63ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum___nv_bfloat16v 256 bytes stack frame, 284 bytes spill stores, 336 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z68ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum___nv_bfloat16v 656 bytes stack frame, 1416 bytes spill stores, 2184 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_RING_SIMPLE_PreMulSum___nv_bfloat16v 384 bytes stack frame, 448 bytes spill stores, 1064 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_RING_LL128_PreMulSum___nv_bfloat16v 568 bytes stack frame, 440 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_RING_LL_PreMulSum___nv_bfloat16v 192 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum___nv_bfloat16v 520 bytes stack frame, 1064 bytes spill stores, 1624 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_TREE_LL128_PreMulSum___nv_bfloat16v 560 bytes stack frame, 408 bytes spill stores, 576 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_TREE_LL_PreMulSum___nv_bfloat16v 368 bytes stack frame, 576 bytes spill stores, 792 bytes spill loads ptxas info : 4 bytes gmem ptxas info : Function properties for _Z63ncclFunction_AllReduce_NVLS_TREE_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z62ncclFunction_AllReduce_NVLS_TREE_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z59ncclFunction_AllReduce_NVLS_TREE_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_NVLS_SIMPLE_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_NVLS_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_NVLS_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_PreMulSum___nv_bfloat16v 240 bytes stack frame, 260 bytes spill stores, 348 bytes spill loads ptxas info : Function properties for _Z66ncclFunction_AllReduce_COLLNET_CHAIN_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z63ncclFunction_AllReduce_COLLNET_CHAIN_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z68ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_PreMulSum___nv_bfloat16v 600 bytes stack frame, 1344 bytes spill stores, 2156 bytes spill loads ptxas info : Function properties for _Z67ncclFunction_AllReduce_COLLNET_DIRECT_LL128_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z64ncclFunction_AllReduce_COLLNET_DIRECT_LL_PreMulSum___nv_bfloat16v 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_RING_SIMPLE_PreMulSum___nv_bfloat16v 312 bytes stack frame, 360 bytes spill stores, 652 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_RING_LL128_PreMulSum___nv_bfloat16v 512 bytes stack frame, 368 bytes spill stores, 540 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_RING_LL_PreMulSum___nv_bfloat16v 192 bytes stack frame, 188 bytes spill stores, 188 bytes spill loads ptxas info : Function properties for _Z58ncclFunction_AllReduce_TREE_SIMPLE_PreMulSum___nv_bfloat16v 448 bytes stack frame, 1044 bytes spill stores, 1592 bytes spill loads ptxas info : Function properties for _Z57ncclFunction_AllReduce_TREE_LL128_PreMulSum___nv_bfloat16v 488 bytes stack frame, 304 bytes spill stores, 304 bytes spill loads ptxas info : Function properties for _Z54ncclFunction_AllReduce_TREE_LL_PreMulSum___nv_bfloat16v 352 bytes stack frame, 540 bytes spill stores, 776 bytes spill loads make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' /usr/local/cuda/bin/nvcc -ccbin g++ -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90 -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -O3 -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -I. -I.. -I/builddir/build/BUILD/nccl-2.18.3-1/build/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" -dlink /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/functions.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/onerank_reduce.o -o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/devlink.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[2]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' Archiving objects > /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/colldevice.a ar cr /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/colldevice.a /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sum_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_prod_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_min_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_max_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_premulsum_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/sendrecv_sumpostdiv_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sum_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_prod_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_min_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_max_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_premulsum_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_reduce_sumpostdiv_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sum_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_prod_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_min_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_max_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_premulsum_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/all_gather_sumpostdiv_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sum_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_prod_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_min_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_max_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_premulsum_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/broadcast_sumpostdiv_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sum_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_prod_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_min_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_max_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_premulsum_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_sumpostdiv_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sum_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_prod_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_min_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_max_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_premulsum_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_i8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_u8.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_i32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_u32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_i64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_u64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_f16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_f32.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_f64.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/reduce_scatter_sumpostdiv_bf16.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/functions.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/onerank_reduce.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/devlink.o make[2]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src/collectives/device' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Linking libnccl.so.2.18.3 > /builddir/build/BUILD/nccl-2.18.3-1/build/lib/libnccl.so.2.18.3 mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/lib g++ -DCUDA_MAJOR=12 -DCUDA_MINOR=1 -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla -I /usr/local/cuda/include -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -O3 -g -Wall -Wextra -DPROFAPI -shared -Wl,--no-as-needed -Wl,-soname,libnccl.so.2 -o /builddir/build/BUILD/nccl-2.18.3-1/build/lib/libnccl.so.2.18.3 /builddir/build/BUILD/nccl-2.18.3-1/build/obj/init.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/init_nvtx.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/channel.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/bootstrap.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/enqueue.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/group.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/debug.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/proxy.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/net.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/cudawrap.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/nvmlwrap.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/ibvsymbols.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/ibvwrap.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/gdrwrap.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/utils.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/argcheck.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/socket.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/shmutils.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/profiler.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/param.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/strongstream.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/ipcsocket.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/p2p.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/shm.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/net.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/net_socket.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/net_ib.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/coll_net.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/nvls.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/sendrecv.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/all_reduce.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/all_gather.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/broadcast.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/reduce.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/reduce_scatter.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/topo.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/paths.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/search.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/connect.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/rings.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/trees.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/tuning.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/xml.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/enhcompat.o /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/colldevice.a -Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/anolis/anolis-hardened-ld -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -Wl,--build-id=sha1 -L/usr/local/cuda/lib64 -lcudart_static -lpthread -lrt -ldl ln -sf libnccl.so.2 /builddir/build/BUILD/nccl-2.18.3-1/build/lib/libnccl.so ln -sf libnccl.so.2.18.3 /builddir/build/BUILD/nccl-2.18.3-1/build/lib/libnccl.so.2 make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' make[1]: Entering directory '/builddir/build/BUILD/nccl-2.18.3-1/src' Archiving libnccl_static.a > /builddir/build/BUILD/nccl-2.18.3-1/build/lib/libnccl_static.a mkdir -p /builddir/build/BUILD/nccl-2.18.3-1/build/lib printf "create /builddir/build/BUILD/nccl-2.18.3-1/build/lib/libnccl_static.a\naddlib /builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/device/colldevice.a\naddmod /builddir/build/BUILD/nccl-2.18.3-1/build/obj/init.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/init_nvtx.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/channel.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/bootstrap.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/enqueue.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/group.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/debug.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/proxy.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/net.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/cudawrap.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/nvmlwrap.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/ibvsymbols.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/ibvwrap.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/gdrwrap.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/utils.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/argcheck.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/socket.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/shmutils.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/profiler.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/param.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/strongstream.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/misc/ipcsocket.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/p2p.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/shm.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/net.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/net_socket.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/net_ib.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/coll_net.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/transport/nvls.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/sendrecv.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/all_reduce.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/all_gather.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/broadcast.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/reduce.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/collectives/reduce_scatter.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/topo.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/paths.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/search.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/connect.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/rings.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/trees.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/tuning.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/graph/xml.o,/builddir/build/BUILD/nccl-2.18.3-1/build/obj/enhcompat.o\nsave\nend" | ar -M make[1]: Leaving directory '/builddir/build/BUILD/nccl-2.18.3-1/src' + RPM_EC=0 ++ jobs -p + exit 0 Executing(%install): /bin/sh -e /var/tmp/rpm-tmp.c6ifjf + umask 022 + cd /builddir/build/BUILD + '[' /builddir/build/BUILDROOT/libnccl-2.18.3-2.cuda12.1.an23.x86_64 '!=' / ']' + rm -rf /builddir/build/BUILDROOT/libnccl-2.18.3-2.cuda12.1.an23.x86_64 ++ dirname /builddir/build/BUILDROOT/libnccl-2.18.3-2.cuda12.1.an23.x86_64 + mkdir -p /builddir/build/BUILDROOT + mkdir /builddir/build/BUILDROOT/libnccl-2.18.3-2.cuda12.1.an23.x86_64 + CFLAGS='-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection' + export CFLAGS + CXXFLAGS='-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection' + export CXXFLAGS + FFLAGS='-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -I/usr/lib64/gfortran/modules' + export FFLAGS + FCFLAGS='-O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -I/usr/lib64/gfortran/modules' + export FCFLAGS + LDFLAGS='-Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/anolis/anolis-hardened-ld -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -Wl,--build-id=sha1 ' + export LDFLAGS + LT_SYS_LIBRARY_PATH=/usr/lib64: + export LT_SYS_LIBRARY_PATH + CC=gcc + export CC + CXX=g++ + export CXX + cd nccl-2.18.3-1 + mkdir -p /builddir/build/BUILDROOT/libnccl-2.18.3-2.cuda12.1.an23.x86_64 + mkdir -p /builddir/build/BUILDROOT/libnccl-2.18.3-2.cuda12.1.an23.x86_64/usr/include + mkdir -p /builddir/build/BUILDROOT/libnccl-2.18.3-2.cuda12.1.an23.x86_64/usr/lib64 + mkdir -p /builddir/build/BUILDROOT/libnccl-2.18.3-2.cuda12.1.an23.x86_64/usr/lib64/pkgconfig + cp -d build/lib/libnccl.so build/lib/libnccl.so.2 build/lib/libnccl.so.2.18.3 build/lib/libnccl_static.a /builddir/build/BUILDROOT/libnccl-2.18.3-2.cuda12.1.an23.x86_64/usr/lib64 + cp build/include/nccl.h build/include/nccl_net.h /builddir/build/BUILDROOT/libnccl-2.18.3-2.cuda12.1.an23.x86_64/usr/include + cp build/lib/pkgconfig/nccl.pc /builddir/build/BUILDROOT/libnccl-2.18.3-2.cuda12.1.an23.x86_64/usr/lib64/pkgconfig/ + /usr/bin/find-debuginfo -j80 --strict-build-id -m -i --build-id-seed 2.18.3-2.cuda12.1.an23 --unique-debug-suffix -2.18.3-2.cuda12.1.an23.x86_64 --unique-debug-src-base libnccl-2.18.3-2.cuda12.1.an23.x86_64 --run-dwz --dwz-low-mem-die-limit 10000000 --dwz-max-die-limit 110000000 -S debugsourcefiles.list /builddir/build/BUILD/nccl-2.18.3-1 extracting debug info from /builddir/build/BUILDROOT/libnccl-2.18.3-2.cuda12.1.an23.x86_64/usr/lib64/libnccl.so.2.18.3 Support for debuginfod is not compiled into GDB. original debug info size: 7936kB, size after compression: 6548kB /usr/bin/sepdebugcrcfix: Updated 1 CRC32s, 0 CRC32s did match. 2685 blocks + /usr/lib/rpm/check-buildroot + /usr/lib/rpm/anolis/brp-ldconfig + COMPRESS='zstd -f --rm -19 -T0' + COMPRESS_EXT=.zst + /usr/lib/rpm/brp-compress + /usr/lib/rpm/anolis/brp-strip-lto /usr/bin/strip + /usr/lib/rpm/brp-strip-static-archive /usr/bin/strip + /usr/lib/rpm/check-rpaths + /usr/lib/rpm/brp-remove-la-files + /usr/lib/rpm/anolis/clean_perl + /usr/lib/rpm/anolis/check_elf_files + /usr/lib/rpm/anolis/brp-mangle-shebangs + /usr/lib/rpm/anolis/remove-info-dir + /usr/lib/rpm/anolis/check-desktop-files + /usr/lib/rpm/anolis/brp-python-bytecompile '' 1 0 + /usr/lib/rpm/anolis/brp-python-hardlink Processing files: libnccl-2.18.3-2.cuda12.1.an23.x86_64 Executing(%license): /bin/sh -e /var/tmp/rpm-tmp.q3rOio + umask 022 + cd /builddir/build/BUILD + cd nccl-2.18.3-1 + LICENSEDIR=/builddir/build/BUILDROOT/libnccl-2.18.3-2.cuda12.1.an23.x86_64/usr/share/licenses/libnccl + export LC_ALL=C + LC_ALL=C + export LICENSEDIR + /usr/bin/mkdir -p /builddir/build/BUILDROOT/libnccl-2.18.3-2.cuda12.1.an23.x86_64/usr/share/licenses/libnccl + cp -pr LICENSE.txt /builddir/build/BUILDROOT/libnccl-2.18.3-2.cuda12.1.an23.x86_64/usr/share/licenses/libnccl + RPM_EC=0 ++ jobs -p + exit 0 Provides: libnccl = 2.18.3-2.cuda12.1.an23 libnccl(x86-64) = 2.18.3-2.cuda12.1.an23 libnccl.so.2()(64bit) Requires(rpmlib): rpmlib(CompressedFileNames) <= 3.0.4-1 rpmlib(FileDigests) <= 4.6.0-1 rpmlib(PayloadFilesHavePrefix) <= 4.0-1 Requires: ld-linux-x86-64.so.2()(64bit) ld-linux-x86-64.so.2(GLIBC_2.3)(64bit) libc.so.6()(64bit) libc.so.6(GLIBC_2.14)(64bit) libc.so.6(GLIBC_2.17)(64bit) libc.so.6(GLIBC_2.2.5)(64bit) libc.so.6(GLIBC_2.3)(64bit) libc.so.6(GLIBC_2.3.2)(64bit) libc.so.6(GLIBC_2.3.3)(64bit) libc.so.6(GLIBC_2.3.4)(64bit) libc.so.6(GLIBC_2.33)(64bit) libc.so.6(GLIBC_2.34)(64bit) libc.so.6(GLIBC_2.4)(64bit) libc.so.6(GLIBC_2.6)(64bit) libc.so.6(GLIBC_2.7)(64bit) libgcc_s.so.1()(64bit) libgcc_s.so.1(GCC_3.0)(64bit) libm.so.6()(64bit) libm.so.6(GLIBC_2.2.5)(64bit) libstdc++.so.6()(64bit) libstdc++.so.6(CXXABI_1.3)(64bit) libstdc++.so.6(GLIBCXX_3.4)(64bit) libstdc++.so.6(GLIBCXX_3.4.11)(64bit) libstdc++.so.6(GLIBCXX_3.4.19)(64bit) rtld(GNU_HASH) Processing files: libnccl-devel-2.18.3-2.cuda12.1.an23.x86_64 Executing(%license): /bin/sh -e /var/tmp/rpm-tmp.3q3eMa + umask 022 + cd /builddir/build/BUILD + cd nccl-2.18.3-1 + LICENSEDIR=/builddir/build/BUILDROOT/libnccl-2.18.3-2.cuda12.1.an23.x86_64/usr/share/licenses/libnccl-devel + export LC_ALL=C + LC_ALL=C + export LICENSEDIR + /usr/bin/mkdir -p /builddir/build/BUILDROOT/libnccl-2.18.3-2.cuda12.1.an23.x86_64/usr/share/licenses/libnccl-devel + cp -pr LICENSE.txt /builddir/build/BUILDROOT/libnccl-2.18.3-2.cuda12.1.an23.x86_64/usr/share/licenses/libnccl-devel + RPM_EC=0 ++ jobs -p + exit 0 Provides: libnccl-devel = 2.18.3-2.cuda12.1.an23 libnccl-devel(x86-64) = 2.18.3-2.cuda12.1.an23 pkgconfig(nccl) = 2.18.3 Requires(rpmlib): rpmlib(CompressedFileNames) <= 3.0.4-1 rpmlib(FileDigests) <= 4.6.0-1 rpmlib(PayloadFilesHavePrefix) <= 4.0-1 Requires: /usr/bin/pkg-config libnccl.so.2()(64bit) Processing files: libnccl-static-2.18.3-2.cuda12.1.an23.x86_64 Executing(%license): /bin/sh -e /var/tmp/rpm-tmp.JEznPt + umask 022 + cd /builddir/build/BUILD + cd nccl-2.18.3-1 + LICENSEDIR=/builddir/build/BUILDROOT/libnccl-2.18.3-2.cuda12.1.an23.x86_64/usr/share/licenses/libnccl-static + export LC_ALL=C + LC_ALL=C + export LICENSEDIR + /usr/bin/mkdir -p /builddir/build/BUILDROOT/libnccl-2.18.3-2.cuda12.1.an23.x86_64/usr/share/licenses/libnccl-static + cp -pr LICENSE.txt /builddir/build/BUILDROOT/libnccl-2.18.3-2.cuda12.1.an23.x86_64/usr/share/licenses/libnccl-static + RPM_EC=0 ++ jobs -p + exit 0 Provides: libnccl-static = 2.18.3-2.cuda12.1.an23 libnccl-static(x86-64) = 2.18.3-2.cuda12.1.an23 Requires(rpmlib): rpmlib(CompressedFileNames) <= 3.0.4-1 rpmlib(FileDigests) <= 4.6.0-1 rpmlib(PayloadFilesHavePrefix) <= 4.0-1 Processing files: libnccl-debugsource-2.18.3-2.cuda12.1.an23.x86_64 Provides: libnccl-debugsource = 2.18.3-2.cuda12.1.an23 libnccl-debugsource(x86-64) = 2.18.3-2.cuda12.1.an23 Requires(rpmlib): rpmlib(CompressedFileNames) <= 3.0.4-1 rpmlib(FileDigests) <= 4.6.0-1 rpmlib(PayloadFilesHavePrefix) <= 4.0-1 Processing files: libnccl-debuginfo-2.18.3-2.cuda12.1.an23.x86_64 Provides: debuginfo(build-id) = 3762d2bb95347c4b4fa31e3eacb30b10c684139c libnccl-debuginfo = 2.18.3-2.cuda12.1.an23 libnccl-debuginfo(x86-64) = 2.18.3-2.cuda12.1.an23 libnccl.so.2.18.3-2.18.3-2.cuda12.1.an23.x86_64.debug()(64bit) Requires(rpmlib): rpmlib(CompressedFileNames) <= 3.0.4-1 rpmlib(FileDigests) <= 4.6.0-1 rpmlib(PayloadFilesHavePrefix) <= 4.0-1 Recommends: libnccl-debugsource(x86-64) = 2.18.3-2.cuda12.1.an23 Checking for unpackaged file(s): /usr/lib/rpm/check-files /builddir/build/BUILDROOT/libnccl-2.18.3-2.cuda12.1.an23.x86_64 Wrote: /builddir/build/RPMS/libnccl-devel-2.18.3-2.cuda12.1.an23.x86_64.rpm Wrote: /builddir/build/RPMS/libnccl-debugsource-2.18.3-2.cuda12.1.an23.x86_64.rpm Wrote: /builddir/build/RPMS/libnccl-debuginfo-2.18.3-2.cuda12.1.an23.x86_64.rpm Wrote: /builddir/build/RPMS/libnccl-2.18.3-2.cuda12.1.an23.x86_64.rpm Wrote: /builddir/build/RPMS/libnccl-static-2.18.3-2.cuda12.1.an23.x86_64.rpm Child return code was: 0